Diffstat (limited to 'llvm/lib/Target/AArch64/GISel')
9 files changed, 9420 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp new file mode 100644 index 000000000000..11a8d5def429 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -0,0 +1,1049 @@ +//===--- AArch64CallLowering.cpp - Call lowering --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64CallLowering.h" +#include "AArch64ISelLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/MachineValueType.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> + +#define DEBUG_TYPE "aarch64-call-lowering" + +using namespace llvm; + +AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) + : CallLowering(&TLI) {} + +namespace { +struct IncomingArgHandler : public CallLowering::ValueHandler { + IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI); + StackUsed = std::max(StackUsed, Size + Offset); + return AddrReg.getReg(0); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + markPhysRegUsed(PhysReg); + switch (VA.getLocInfo()) { + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + } + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | 
MachineMemOperand::MOInvariant, Size, + inferAlignFromPtrInfo(MF, MPO)); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). + virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + bool isIncomingArgumentHandler() const override { return true; } + + uint64_t StackUsed; +}; + +struct FormalArgHandler : public IncomingArgHandler { + FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); + MIRBuilder.getMBB().addLiveIn(PhysReg); + } +}; + +struct CallReturnHandler : public IncomingArgHandler { + CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIB.addDef(PhysReg, RegState::Implicit); + } + + MachineInstrBuilder MIB; +}; + +struct OutgoingArgHandler : public CallLowering::ValueHandler { + OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn, + CCAssignFn *AssignFnVarArg, bool IsTailCall = false, + int FPDiff = 0) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), + AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), + StackSize(0), SPReg(0) {} + + bool isIncomingArgumentHandler() const override { return false; } + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + MachineFunction &MF = MIRBuilder.getMF(); + LLT p0 = LLT::pointer(0, 64); + LLT s64 = LLT::scalar(64); + + if (IsTailCall) { + Offset += FPDiff; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + auto FIReg = MIRBuilder.buildFrameIndex(p0, FI); + MPO = MachinePointerInfo::getFixedStack(MF, FI); + return FIReg.getReg(0); + } + + if (!SPReg) + SPReg = MIRBuilder.buildCopy(p0, Register(AArch64::SP)).getReg(0); + + auto OffsetReg = MIRBuilder.buildConstant(s64, Offset); + + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); + + MPO = MachinePointerInfo::getStack(MF, Offset); + return AddrReg.getReg(0); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + MIB.addUse(PhysReg, RegState::Implicit); + Register ExtReg = extendRegister(ValVReg, VA); + MIRBuilder.buildCopy(PhysReg, ExtReg); + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, + inferAlignFromPtrInfo(MF, MPO)); + MIRBuilder.buildStore(ValVReg, Addr, *MMO); + } + + void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr, + uint64_t Size, MachinePointerInfo &MPO, + CCValAssign &VA) override { + unsigned MaxSize = Size * 8; + // For varargs, we always want to extend them to 8 bytes, in which case + // we disable setting a max. + if (!Arg.IsFixed) + MaxSize = 0; + + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt + ? extendRegister(Arg.Regs[0], VA, MaxSize) + : Arg.Regs[0]; + + // If we extended we might need to adjust the MMO's Size. 
+ const LLT RegTy = MRI.getType(ValVReg); + if (RegTy.getSizeInBytes() > Size) + Size = RegTy.getSizeInBytes(); + + assignValueToAddress(ValVReg, Addr, Size, MPO, VA); + } + + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, + CCState &State) override { + bool Res; + if (Info.IsFixed) + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + else + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); + + StackSize = State.getNextStackOffset(); + return Res; + } + + MachineInstrBuilder MIB; + CCAssignFn *AssignFnVarArg; + bool IsTailCall; + + /// For tail calls, the byte offset of the call's argument area from the + /// callee's. Unused elsewhere. + int FPDiff; + uint64_t StackSize; + + // Cache the SP register vreg if we need it more than once in this call site. + Register SPReg; +}; +} // namespace + +static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { + return CallConv == CallingConv::Fast && TailCallOpt; +} + +void AArch64CallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const { + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + SmallVector<EVT, 4> SplitVTs; + SmallVector<uint64_t, 4> Offsets; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + + if (SplitVTs.size() == 0) + return; + + if (SplitVTs.size() == 1) { + // No splitting to do, but we want to replace the original type (e.g. [1 x + // double] -> double). + SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), + OrigArg.Flags[0], OrigArg.IsFixed); + return; + } + + // Create one ArgInfo for each virtual register in the original ArgInfo. 
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); + + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( + OrigArg.Ty, CallConv, false); + for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { + Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); + SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], + OrigArg.IsFixed); + if (NeedsRegBlock) + SplitArgs.back().Flags[0].setInConsecutiveRegs(); + } + + SplitArgs.back().Flags[0].setInConsecutiveRegsLast(); +} + +bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, + ArrayRef<Register> VRegs, + Register SwiftErrorVReg) const { + auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR); + assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && + "Return value without a vreg"); + + bool Success = true; + if (!VRegs.empty()) { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); + auto &DL = F.getParent()->getDataLayout(); + LLVMContext &Ctx = Val->getType()->getContext(); + + SmallVector<EVT, 4> SplitEVTs; + ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); + assert(VRegs.size() == SplitEVTs.size() && + "For each split Type there should be exactly one VReg."); + + SmallVector<ArgInfo, 8> SplitArgs; + CallingConv::ID CC = F.getCallingConv(); + + for (unsigned i = 0; i < SplitEVTs.size(); ++i) { + if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) { + LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need split"); + return false; + } + + Register CurVReg = VRegs[i]; + ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)}; + setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); + + // i1 is a special case because SDAG i1 true is naturally zero extended + // when widened using ANYEXT. We need to do it explicitly here. + if (MRI.getType(CurVReg).getSizeInBits() == 1) { + CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0); + } else { + // Some types will need extending as specified by the CC. + MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]); + if (EVT(NewVT) != SplitEVTs[i]) { + unsigned ExtendOp = TargetOpcode::G_ANYEXT; + if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, + Attribute::SExt)) + ExtendOp = TargetOpcode::G_SEXT; + else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, + Attribute::ZExt)) + ExtendOp = TargetOpcode::G_ZEXT; + + LLT NewLLT(NewVT); + LLT OldLLT(MVT::getVT(CurArgInfo.Ty)); + CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx); + // Instead of an extend, we might have a vector type which needs + // padding with more elements, e.g. <2 x half> -> <4 x half>. + if (NewVT.isVector()) { + if (OldLLT.isVector()) { + if (NewLLT.getNumElements() > OldLLT.getNumElements()) { + // We don't handle VA types which are not exactly twice the + // size, but can easily be done in future. + if (NewLLT.getNumElements() != OldLLT.getNumElements() * 2) { + LLVM_DEBUG(dbgs() << "Outgoing vector ret has too many elts"); + return false; + } + auto Undef = MIRBuilder.buildUndef({OldLLT}); + CurVReg = + MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef}).getReg(0); + } else { + // Just do a vector extend. 
+ CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}) + .getReg(0); + } + } else if (NewLLT.getNumElements() == 2) { + // We need to pad a <1 x S> type to <2 x S>. Since we don't have + // <1 x S> vector types in GISel we use a build_vector instead + // of a vector merge/concat. + auto Undef = MIRBuilder.buildUndef({OldLLT}); + CurVReg = + MIRBuilder + .buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)}) + .getReg(0); + } else { + LLVM_DEBUG(dbgs() << "Could not handle ret ty"); + return false; + } + } else { + // A scalar extend. + CurVReg = + MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0); + } + } + } + if (CurVReg != CurArgInfo.Regs[0]) { + CurArgInfo.Regs[0] = CurVReg; + // Reset the arg flags after modifying CurVReg. + setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); + } + splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC); + } + + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn); + Success = handleAssignments(MIRBuilder, SplitArgs, Handler); + } + + if (SwiftErrorVReg) { + MIB.addUse(AArch64::X21, RegState::Implicit); + MIRBuilder.buildCopy(AArch64::X21, SwiftErrorVReg); + } + + MIRBuilder.insertInstr(MIB); + return Success; +} + +/// Helper function to compute forwarded registers for musttail calls. Computes +/// the forwarded registers, sets MBB liveness, and emits COPY instructions that +/// can be used to save + restore registers later. +static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, + CCAssignFn *AssignFn) { + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + MachineFunction &MF = MIRBuilder.getMF(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (!MFI.hasMustTailInVarArgFunc()) + return; + + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + const Function &F = MF.getFunction(); + assert(F.isVarArg() && "Expected F to be vararg?"); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs, + F.getContext()); + SmallVector<MVT, 2> RegParmTypes; + RegParmTypes.push_back(MVT::i64); + RegParmTypes.push_back(MVT::f128); + + // Later on, we can use this vector to restore the registers if necessary. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // Conservatively forward X8, since it might be used for an aggregate + // return. + if (!CCInfo.isAllocated(AArch64::X8)) { + unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); + } + + // Add the forwards to the MachineBasicBlock and MachineFunction. 
+ for (const auto &F : Forwards) { + MBB.addLiveIn(F.PReg); + MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg)); + } +} + +bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const { + if (isa<ScalableVectorType>(F.getReturnType())) + return true; + return llvm::any_of(F.args(), [](const Argument &A) { + return isa<ScalableVectorType>(A.getType()); + }); +} + +bool AArch64CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + + SmallVector<ArgInfo, 8> SplitArgs; + unsigned i = 0; + for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()).isZero()) + continue; + + ArgInfo OrigArg{VRegs[i], Arg.getType()}; + setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F); + + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv()); + ++i; + } + + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + CCAssignFn *AssignFn = + TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + + FormalArgHandler Handler(MIRBuilder, MRI, AssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + uint64_t StackOffset = Handler.StackUsed; + if (F.isVarArg()) { + auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + if (!Subtarget.isTargetDarwin()) { + // FIXME: we need to reimplement saveVarArgsRegisters from + // AArch64ISelLowering. + return false; + } + + // We currently pass all varargs at 8-byte alignment, or 4 in ILP32. + StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8); + + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); + } + + if (doesCalleeRestoreStack(F.getCallingConv(), + MF.getTarget().Options.GuaranteedTailCallOpt)) { + // We have a non-standard ABI, so why not make full use of the stack that + // we're going to pop? It must be aligned to 16 B in any case. + StackOffset = alignTo(StackOffset, 16); + + // If we're expected to restore the stack (e.g. fastcc), then we'll be + // adding a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackOffset); + + // Our own callers will guarantee that the space is free by giving an + // aligned value to CALLSEQ_START. + } + + // When we tail call, we need to check if the callee's arguments + // will fit on the caller's stack. So, whenever we lower formal arguments, + // we should keep track of this information, since we might lower a tail call + // in this function later. + FuncInfo->setBytesInStackArgArea(StackOffset); + + auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + if (Subtarget.hasCustomCallingConv()) + Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); + + handleMustTailForwardedRegisters(MIRBuilder, AssignFn); + + // Move back to the end of the basic block. + MIRBuilder.setMBB(MBB); + + return true; +} + +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. 
+static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::PreserveMost: + case CallingConv::Swift: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for +/// CC. +static std::pair<CCAssignFn *, CCAssignFn *> +getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) { + return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)}; +} + +bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const { + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + + // If the calling conventions match, then everything must be the same. + if (CalleeCC == CallerCC) + return true; + + // Check if the caller and callee will handle arguments in the same way. + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + CCAssignFn *CalleeAssignFnFixed; + CCAssignFn *CalleeAssignFnVarArg; + std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) = + getAssignFnsForCC(CalleeCC, TLI); + + CCAssignFn *CallerAssignFnFixed; + CCAssignFn *CallerAssignFnVarArg; + std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) = + getAssignFnsForCC(CallerCC, TLI); + + if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed, + *CalleeAssignFnVarArg, *CallerAssignFnFixed, + *CallerAssignFnVarArg)) + return false; + + // Make sure that the caller and callee preserve all of the same registers. + auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) { + TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); + TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); + } + + return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved); +} + +bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const { + // If there are no outgoing arguments, then we are done. + if (OutArgs.empty()) + return true; + + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + // We have outgoing arguments. Make sure that we can tail call with them. + SmallVector<CCValAssign, 16> OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext()); + + if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) { + LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n"); + return false; + } + + // Make sure that they can fit on the caller's stack. + const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { + LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); + return false; + } + + // Verify that the parameters in callee-saved registers match. 
+ // TODO: Port this over to CallLowering as general code once swiftself is + // supported. + auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (unsigned i = 0; i < OutLocs.size(); ++i) { + auto &ArgLoc = OutLocs[i]; + // If it's not a register, it's fine. + if (!ArgLoc.isRegLoc()) { + if (Info.IsVarArg) { + // Be conservative and disallow variadic memory operands to match SDAG's + // behaviour. + // FIXME: If the caller's calling convention is C, then we can + // potentially use its argument area. However, for cases like fastcc, + // we can't do anything. + LLVM_DEBUG( + dbgs() + << "... Cannot tail call vararg function with stack arguments\n"); + return false; + } + continue; + } + + Register Reg = ArgLoc.getLocReg(); + + // Only look at callee-saved registers. + if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg)) + continue; + + LLVM_DEBUG( + dbgs() + << "... Call has an argument passed in a callee-saved register.\n"); + + // Check if it was copied from. + ArgInfo &OutInfo = OutArgs[i]; + + if (OutInfo.Regs.size() > 1) { + LLVM_DEBUG( + dbgs() << "... Cannot handle arguments in multiple registers.\n"); + return false; + } + + // Check if we copy the register, walking through copies from virtual + // registers. Note that getDefIgnoringCopies does not ignore copies from + // physical registers. + MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI); + if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) { + LLVM_DEBUG( + dbgs() + << "... Parameter was not copied into a VReg, cannot tail call.\n"); + return false; + } + + // Got a copy. Verify that it's the same as the register we want. + Register CopyRHS = RegDef->getOperand(1).getReg(); + if (CopyRHS != Reg) { + LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into " + "VReg, cannot tail call.\n"); + return false; + } + } + + return true; +} + +bool AArch64CallLowering::isEligibleForTailCallOptimization( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, + SmallVectorImpl<ArgInfo> &OutArgs) const { + + // Must pass all target-independent checks in order to tail call optimize. + if (!Info.IsTailCall) + return false; + + CallingConv::ID CalleeCC = Info.CallConv; + MachineFunction &MF = MIRBuilder.getMF(); + const Function &CallerF = MF.getFunction(); + + LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n"); + + if (Info.SwiftErrorVReg) { + // TODO: We should handle this. + // Note that this is also handled by the check for no outgoing arguments. + // Proactively disabling this though, because the swifterror handling in + // lowerCall inserts a COPY *after* the location of the call. + LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n"); + return false; + } + + if (!mayTailCallThisCC(CalleeCC)) { + LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n"); + return false; + } + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86). + // + // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try + // it? + // + // On Windows, "inreg" attributes signify non-aggregate indirect returns. + // In this case, it is necessary to save/restore X0 in the callee. Tail + // call opt interferes with this. 
So we disable tail call opt when the + // caller has an argument with "inreg" attribute. + // + // FIXME: Check whether the callee also has an "inreg" argument. + // + // When the caller has a swifterror argument, we don't want to tail call + // because would have to move into the swifterror register before the + // tail call. + if (any_of(CallerF.args(), [](const Argument &A) { + return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr(); + })) { + LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, " + "inreg, or swifterror arguments\n"); + return false; + } + + // Externally-defined functions with weak linkage should not be + // tail-called on AArch64 when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (Info.Callee.isGlobal()) { + const GlobalValue *GV = Info.Callee.getGlobal(); + const Triple &TT = MF.getTarget().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || + TT.isOSBinFormatMachO())) { + LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function " + "with weak linkage for this OS.\n"); + return false; + } + } + + // If we have -tailcallopt, then we're done. + if (MF.getTarget().Options.GuaranteedTailCallOpt) + return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); + + // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall). + // Try to find cases where we can do that. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. + assert((!Info.IsVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + // Verify that the incoming and outgoing arguments from the callee are + // safe to tail call. + if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) { + LLVM_DEBUG( + dbgs() + << "... Caller and callee have incompatible calling conventions.\n"); + return false; + } + + if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) + return false; + + LLVM_DEBUG( + dbgs() << "... Call is eligible for tail call optimization.\n"); + return true; +} + +static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, + bool IsTailCall) { + if (!IsTailCall) + return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; + + if (!IsIndirect) + return AArch64::TCRETURNdi; + + // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use + // x16 or x17. + if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement")) + return AArch64::TCRETURNriBTI; + + return AArch64::TCRETURNri; +} + +bool AArch64CallLowering::lowerTailCall( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + + // True when we're tail calling, but without -tailcallopt. 
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; + + // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 + // register class. Until we can do that, we should fall back here. + if (F.hasFnAttribute("branch-target-enforcement")) { + LLVM_DEBUG( + dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n"); + return false; + } + + // Find out which ABI gets to decide where things go. + CallingConv::ID CalleeCC = Info.CallConv; + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + MachineInstrBuilder CallSeqStart; + if (!IsSibCall) + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); + + // Byte offset for the tail call. When we are sibcalling, this will always + // be 0. + MIB.addImm(0); + + // Tell the call which registers are clobbered. + auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); + if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) + TRI->UpdateCustomCallPreservedMask(MF, &Mask); + MIB.addRegMask(Mask); + + if (TRI->isAnyArgRegReserved(MF)) + TRI->emitReservedArgRegCallError(MF); + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. + int FPDiff = 0; + + // This will be 0 for sibcalls, potentially nonzero for tail calls produced + // by -tailcallopt. For sibcalls, the memory operands for the call are + // already available in the caller's incoming argument space. + unsigned NumBytes = 0; + if (!IsSibCall) { + // We aren't sibcalling, so we need to compute FPDiff. We need to do this + // before handling assignments, because FPDiff must be known for memory + // arguments. + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + SmallVector<CCValAssign, 16> OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext()); + analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg); + + // The callee will pop the argument stack as a tail call. Thus, we must + // keep it 16-byte aligned. + NumBytes = alignTo(OutInfo.getNextStackOffset(), 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); + + // Do the actual argument marshalling. 
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg, true, FPDiff); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) + return false; + + if (Info.IsVarArg && Info.IsMustTailCall) { + // Now we know what's being passed to the function. Add uses to the call for + // the forwarded registers that we *aren't* passing as parameters. This will + // preserve the copies we build earlier. + for (const auto &F : Forwards) { + Register ForwardedReg = F.PReg; + // If the register is already passed, or aliases a register which is + // already being passed, then skip it. + if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) { + if (!Use.isReg()) + return false; + return TRI->regsOverlap(Use.getReg(), ForwardedReg); + })) + continue; + + // We aren't passing it already, so we should add it to the call. + MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg)); + MIB.addReg(ForwardedReg, RegState::Implicit); + } + } + + // If we have -tailcallopt, we need to adjust the stack. We'll do the call + // sequence start and end here. + if (!IsSibCall) { + MIB->getOperand(1).setImm(FPDiff); + CallSeqStart.addImm(NumBytes).addImm(0); + // End the call sequence *before* emitting the call. Normally, we would + // tidy the frame up after the call. However, here, we've laid out the + // parameters so that when SP is reset, they will be in the correct + // location. + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0); + } + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific instruction, + // it must have a register class matching the constraint of that instruction. + if (Info.Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); + + MF.getFrameInfo().setHasTailCall(); + Info.LoweredTailCall = true; + return true; +} + +bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + + SmallVector<ArgInfo, 8> OutArgs; + for (auto &OrigArg : Info.OrigArgs) { + splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv); + // AAPCS requires that we zero-extend i1 to 8 bits by the caller. + if (OrigArg.Ty->isIntegerTy(1)) + OutArgs.back().Flags[0].setZExt(); + } + + SmallVector<ArgInfo, 8> InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) + splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv()); + + // If we can lower as a tail call, do that instead. + bool CanTailCallOpt = + isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs); + + // We must emit a tail call if we have musttail. + if (Info.IsMustTailCall && !CanTailCallOpt) { + // There are types of incoming/outgoing arguments we can't handle yet, so + // it doesn't make sense to actually die here like in ISelLowering. Instead, + // fall back to SelectionDAG and let it try to handle this. + LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n"); + return false; + } + + if (CanTailCallOpt) + return lowerTailCall(MIRBuilder, Info, OutArgs); + + // Find out which ABI gets to decide where things go. 
+ CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = + getAssignFnsForCC(Info.CallConv, TLI); + + MachineInstrBuilder CallSeqStart; + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + + // Create a temporarily-floating call instruction so we can add the implicit + // uses of arg registers. + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); + + // Tell the call which registers are clobbered. + auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv); + if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) + TRI->UpdateCustomCallPreservedMask(MF, &Mask); + MIB.addRegMask(Mask); + + if (TRI->isAnyArgRegReserved(MF)) + TRI->emitReservedArgRegCallError(MF); + + // Do the actual argument marshalling. + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg, false); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) + return false; + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific + // instruction, it must have a register class matching the + // constraint of that instruction. + if (Info.Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); + + // Finally we can copy the returned value back into its virtual-register. In + // symmetry with the arguments, the physical register must be an + // implicit-define of the call instruction. + if (!Info.OrigRet.Ty->isVoidTy()) { + CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); + CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); + if (!handleAssignments(MIRBuilder, InArgs, Handler)) + return false; + } + + if (Info.SwiftErrorVReg) { + MIB.addDef(AArch64::X21, RegState::Implicit); + MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21)); + } + + uint64_t CalleePopBytes = + doesCalleeRestoreStack(Info.CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt) + ? alignTo(Handler.StackSize, 16) + : 0; + + CallSeqStart.addImm(Handler.StackSize).addImm(0); + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) + .addImm(Handler.StackSize) + .addImm(CalleePopBytes); + + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h new file mode 100644 index 000000000000..640a86253059 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -0,0 +1,84 @@ +//===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/IR/CallingConv.h" +#include <cstdint> +#include <functional> + +namespace llvm { + +class AArch64TargetLowering; +class CCValAssign; +class DataLayout; +class MachineIRBuilder; +class MachineRegisterInfo; +class Type; + +class AArch64CallLowering: public CallLowering { +public: + AArch64CallLowering(const AArch64TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + ArrayRef<Register> VRegs, + Register SwiftErrorVReg) const override; + + bool fallBackToDAGISel(const Function &F) const override; + + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const override; + + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; + + /// Returns true if the call can be lowered as a tail call. + bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + bool supportSwiftError() const override { return true; } + +private: + using RegHandler = std::function<void(MachineIRBuilder &, Type *, unsigned, + CCValAssign &)>; + + using MemHandler = + std::function<void(MachineIRBuilder &, int, CCValAssign &)>; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp new file mode 100644 index 000000000000..408f0cb77e73 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -0,0 +1,5704 @@ +//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterBankInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "aarch64-isel" + +using namespace llvm; + +namespace { + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +class AArch64InstructionSelector : public InstructionSelector { +public: + AArch64InstructionSelector(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override { + InstructionSelector::setupMF(MF, KB, CoverageInfo); + + // hasFnAttribute() is expensive to call on every BRCOND selection, so + // cache it here for each run of the selector. + ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + MFReturnAddr = Register(); + + processPHIs(MF); + } + +private: + /// tblgen-erated 'select' implementation, used as the initial selector for + /// the patterns that don't require complex C++. + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + // A lowering phase that runs before any selection attempts. + // Returns true if the instruction was modified. + bool preISelLower(MachineInstr &I); + + // An early selection function that runs before the selectImpl() call. + bool earlySelect(MachineInstr &I) const; + + // Do some preprocessing of G_PHIs before we begin selection. + void processPHIs(MachineFunction &MF); + + bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + + /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
+ bool contractCrossBankCopyIntoStore(MachineInstr &I, + MachineRegisterInfo &MRI); + + bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); + + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool tryOptAndIntoCompareBranch(MachineInstr *LHS, + int64_t CmpConstant, + const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; + bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + + // Helper to generate an equivalent of scalar_to_vector into a new register, + // returned via 'Dst'. + MachineInstr *emitScalarToVector(unsigned EltSize, + const TargetRegisterClass *DstRC, + Register Scalar, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a lane insert into \p DstReg, or a new vector register if None is + /// provided. + /// + /// The lane inserted into is defined by \p LaneIdx. The vector source + /// register is given by \p SrcReg. The register containing the element is + /// given by \p EltReg. + MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg, + Register EltReg, unsigned LaneIdx, + const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const; + bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, + MachineRegisterInfo &MRI) const; + bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + + bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectSplitVectorUnmerge(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsicWithSideEffects(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; + + unsigned emitConstantPoolEntry(const Constant *CPVal, + MachineFunction &MF) const; + MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; + + // Emit a vector concat operation. + MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1, + Register Op2, + MachineIRBuilder &MIRBuilder) const; + + // Emit an integer compare between LHS and RHS, which checks for Predicate. + // + // This returns the produced compare instruction, and the predicate which + // was ultimately used in the compare. The predicate may differ from what + // is passed in \p Predicate due to optimization. 
+ std::pair<MachineInstr *, CmpInst::Predicate> + emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitTST(const Register &LHS, const Register &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitExtractVectorElt(Optional<Register> DstReg, + const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, + MachineIRBuilder &MIRBuilder) const; + + /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be + /// materialized using a FMOV instruction, then update MI and return it. + /// Otherwise, do nothing and return a nullptr. + MachineInstr *emitFMovForFConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) const; + + /// Emit a CSet for a compare. + MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. + /// \p IsNegative is true if the test should be "not zero". + /// This will also optimize the test bit instruction when possible. + MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; + + // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. + // We use these manually instead of using the importer since it doesn't + // support SDNodeXForm. + ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; + + ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; + ComplexRendererFns selectArithImmed(MachineOperand &Root) const; + ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; + + ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const; + + ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 1); + } + ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 2); + } + ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 4); + } + ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 8); + } + ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 16); + } + + /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used + /// from complex pattern matchers like selectAddrModeIndexed(). 
+ ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, + MachineRegisterInfo &MRI) const; + + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const; + template <int Width> + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { + return selectAddrModeIndexed(Root, Width / 8); + } + + bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + ComplexRendererFns + selectAddrModeShiftedExtendXReg(MachineOperand &Root, + unsigned SizeInBytes) const; + + /// Returns a \p ComplexRendererFns which contains a base, offset, and whether + /// or not a shift + extend should be folded into an addressing mode. Returns + /// None when this is not profitable or possible. + ComplexRendererFns + selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, + MachineOperand &Offset, unsigned SizeInBytes, + bool WantsExt) const; + ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template <int Width> + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { + return selectAddrModeXRO(Root, Width / 8); + } + + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template <int Width> + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { + return selectAddrModeWRO(Root, Width / 8); + } + + ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; + + ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { + return selectShiftedRegister(Root); + } + + ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { + // TODO: selectShiftedRegister should allow for rotates on logical shifts. + // For now, make them the same. The only difference between the two is that + // logical shifts are allowed to fold in rotates. Otherwise, these are + // functionally the same. + return selectShiftedRegister(Root); + } + + /// Given an extend instruction, determine the correct shift-extend type for + /// that instruction. + /// + /// If the instruction is going to be used in a load or store, pass + /// \p IsLoadStore = true. + AArch64_AM::ShiftExtendType + getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, + bool IsLoadStore = false) const; + + /// Instructions that accept extend modifiers like UXTW expect the register + /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a + /// subregister copy if necessary. Return either ExtReg, or the result of the + /// new copy. + Register narrowExtendRegIfNeeded(Register ExtReg, + MachineIRBuilder &MIB) const; + Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size, + MachineIRBuilder &MIB) const; + ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; + + void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx = -1) const; + void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; + void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; + + // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. + void materializeLargeCMVal(MachineInstr &I, const Value *V, + unsigned OpFlags) const; + + // Optimization methods. 
+ bool tryOptSelect(MachineInstr &MI) const; + MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS, + MachineOperand &RHS, + CmpInst::Predicate &Predicate, + MachineIRBuilder &MIB) const; + MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIB) const; + + /// Return true if \p MI is a load or store of \p NumBytes bytes. + bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; + + /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit + /// register zeroed out. In other words, the result of MI has been explicitly + /// zero extended. + bool isDef32(const MachineInstr &MI) const; + + const AArch64TargetMachine &TM; + const AArch64Subtarget &STI; + const AArch64InstrInfo &TII; + const AArch64RegisterInfo &TRI; + const AArch64RegisterBankInfo &RBI; + + bool ProduceNonFlagSettingCondBr = false; + + // Some cached values used during selection. + // We use LR as a live-in register, and we keep track of it here as it can be + // clobbered by calls. + Register MFReturnAddr; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +// We declare the temporaries used by selectImpl() in the class to minimize the +// cost of constructing placeholder values. +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +AArch64InstructionSelector::AArch64InstructionSelector( + const AArch64TargetMachine &TM, const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI) + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +// FIXME: This should be target-independent, inferred from the types declared +// for each class in the bank. +static const TargetRegisterClass * +getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, + const RegisterBankInfo &RBI, + bool GetAllRegSet = false) { + if (RB.getID() == AArch64::GPRRegBankID) { + if (Ty.getSizeInBits() <= 32) + return GetAllRegSet ? &AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (Ty.getSizeInBits() == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + return nullptr; + } + + if (RB.getID() == AArch64::FPRRegBankID) { + if (Ty.getSizeInBits() <= 16) + return &AArch64::FPR16RegClass; + if (Ty.getSizeInBits() == 32) + return &AArch64::FPR32RegClass; + if (Ty.getSizeInBits() == 64) + return &AArch64::FPR64RegClass; + if (Ty.getSizeInBits() == 128) + return &AArch64::FPR128RegClass; + return nullptr; + } + + return nullptr; +} + +/// Given a register bank, and size in bits, return the smallest register class +/// that can represent that combination. +static const TargetRegisterClass * +getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, + bool GetAllRegSet = false) { + unsigned RegBankID = RB.getID(); + + if (RegBankID == AArch64::GPRRegBankID) { + if (SizeInBits <= 32) + return GetAllRegSet ? 
&AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (SizeInBits == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + } + + if (RegBankID == AArch64::FPRRegBankID) { + switch (SizeInBits) { + default: + return nullptr; + case 8: + return &AArch64::FPR8RegClass; + case 16: + return &AArch64::FPR16RegClass; + case 32: + return &AArch64::FPR32RegClass; + case 64: + return &AArch64::FPR64RegClass; + case 128: + return &AArch64::FPR128RegClass; + } + } + + return nullptr; +} + +/// Returns the correct subregister to use for a given register class. +static bool getSubRegForClass(const TargetRegisterClass *RC, + const TargetRegisterInfo &TRI, unsigned &SubReg) { + switch (TRI.getRegSizeInBits(*RC)) { + case 8: + SubReg = AArch64::bsub; + break; + case 16: + SubReg = AArch64::hsub; + break; + case 32: + if (RC != &AArch64::FPR32RegClass) + SubReg = AArch64::sub_32; + else + SubReg = AArch64::ssub; + break; + case 64: + SubReg = AArch64::dsub; + break; + default: + LLVM_DEBUG( + dbgs() << "Couldn't find appropriate subregister for register class."); + return false; + } + + return true; +} + +/// Returns the minimum size the given register bank can hold. +static unsigned getMinSizeForRegBank(const RegisterBank &RB) { + switch (RB.getID()) { + case AArch64::GPRRegBankID: + return 32; + case AArch64::FPRRegBankID: + return 8; + default: + llvm_unreachable("Tried to get minimum size for unknown register bank."); + } +} + +static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { + auto &MI = *Root.getParent(); + auto &MBB = *MI.getParent(); + auto &MF = *MBB.getParent(); + auto &MRI = MF.getRegInfo(); + uint64_t Immed; + if (Root.isImm()) + Immed = Root.getImm(); + else if (Root.isCImm()) + Immed = Root.getCImm()->getZExtValue(); + else if (Root.isReg()) { + auto ValAndVReg = + getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); + if (!ValAndVReg) + return None; + Immed = ValAndVReg->Value; + } else + return None; + return Immed; +} + +/// Check whether \p I is a currently unsupported binary operation: +/// - it has an unsized type +/// - an operand is not a vreg +/// - all operands are not in the same bank +/// These are checks that should someday live in the verifier, but right now, +/// these are mostly limitations of the aarch64 selector. +static bool unsupportedBinOp(const MachineInstr &I, + const AArch64RegisterBankInfo &RBI, + const MachineRegisterInfo &MRI, + const AArch64RegisterInfo &TRI) { + LLT Ty = MRI.getType(I.getOperand(0).getReg()); + if (!Ty.isValid()) { + LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); + return true; + } + + const RegisterBank *PrevOpBank = nullptr; + for (auto &MO : I.operands()) { + // FIXME: Support non-register operands. + if (!MO.isReg()) { + LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); + return true; + } + + // FIXME: Can generic operations have physical registers operands? If + // so, this will need to be taught about that, and we'll need to get the + // bank out of the minimal class for the register. + // Either way, this needs to be documented (and possibly verified). 
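getImmedFromMO above accepts a root operand in three forms: a plain immediate, a ConstantInt immediate, or a virtual register whose definition can be traced back to a constant; anything else makes the match fail. A minimal sketch of that fallback order in plain C++ (Operand here is a hypothetical stand-in, not MachineOperand):

#include <cstdint>
#include <optional>

struct Operand {                        // hypothetical stand-in for MachineOperand
  enum Kind { Imm, CImm, Reg, Other } K;
  uint64_t Value;                       // payload for Imm / CImm
  std::optional<uint64_t> RegConstant;  // constant reached through the vreg, if any
};

std::optional<uint64_t> getImmed(const Operand &Op) {
  switch (Op.K) {
  case Operand::Imm:
  case Operand::CImm:
    return Op.Value;
  case Operand::Reg:
    return Op.RegConstant;              // empty if the def is not a constant
  default:
    return std::nullopt;
  }
}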
+ if (!Register::isVirtualRegister(MO.getReg())) { + LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); + return true; + } + + const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); + if (!OpBank) { + LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); + return true; + } + + if (PrevOpBank && OpBank != PrevOpBank) { + LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); + return true; + } + PrevOpBank = OpBank; + } + return false; +} + +/// Select the AArch64 opcode for the basic binary operation \p GenericOpc +/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID +/// and of size \p OpSize. +/// \returns \p GenericOpc if the combination is unsupported. +static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, + unsigned OpSize) { + switch (RegBankID) { + case AArch64::GPRRegBankID: + if (OpSize == 32) { + switch (GenericOpc) { + case TargetOpcode::G_SHL: + return AArch64::LSLVWr; + case TargetOpcode::G_LSHR: + return AArch64::LSRVWr; + case TargetOpcode::G_ASHR: + return AArch64::ASRVWr; + default: + return GenericOpc; + } + } else if (OpSize == 64) { + switch (GenericOpc) { + case TargetOpcode::G_PTR_ADD: + return AArch64::ADDXrr; + case TargetOpcode::G_SHL: + return AArch64::LSLVXr; + case TargetOpcode::G_LSHR: + return AArch64::LSRVXr; + case TargetOpcode::G_ASHR: + return AArch64::ASRVXr; + default: + return GenericOpc; + } + } + break; + case AArch64::FPRRegBankID: + switch (OpSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_FADD: + return AArch64::FADDSrr; + case TargetOpcode::G_FSUB: + return AArch64::FSUBSrr; + case TargetOpcode::G_FMUL: + return AArch64::FMULSrr; + case TargetOpcode::G_FDIV: + return AArch64::FDIVSrr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_FADD: + return AArch64::FADDDrr; + case TargetOpcode::G_FSUB: + return AArch64::FSUBDrr; + case TargetOpcode::G_FMUL: + return AArch64::FMULDrr; + case TargetOpcode::G_FDIV: + return AArch64::FDIVDrr; + case TargetOpcode::G_OR: + return AArch64::ORRv8i8; + default: + return GenericOpc; + } + } + break; + } + return GenericOpc; +} + +/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, +/// appropriate for the (value) register bank \p RegBankID and of memory access +/// size \p OpSize. This returns the variant with the base+unsigned-immediate +/// addressing mode (e.g., LDRXui). +/// \returns \p GenericOpc if the combination is unsupported. +static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, + unsigned OpSize) { + const bool isStore = GenericOpc == TargetOpcode::G_STORE; + switch (RegBankID) { + case AArch64::GPRRegBankID: + switch (OpSize) { + case 8: + return isStore ? AArch64::STRBBui : AArch64::LDRBBui; + case 16: + return isStore ? AArch64::STRHHui : AArch64::LDRHHui; + case 32: + return isStore ? AArch64::STRWui : AArch64::LDRWui; + case 64: + return isStore ? AArch64::STRXui : AArch64::LDRXui; + } + break; + case AArch64::FPRRegBankID: + switch (OpSize) { + case 8: + return isStore ? AArch64::STRBui : AArch64::LDRBui; + case 16: + return isStore ? AArch64::STRHui : AArch64::LDRHui; + case 32: + return isStore ? AArch64::STRSui : AArch64::LDRSui; + case 64: + return isStore ? AArch64::STRDui : AArch64::LDRDui; + } + break; + } + return GenericOpc; +} + +#ifndef NDEBUG +/// Helper function that verifies that we have a valid copy at the end of +/// selectCopy. 
Verifies that the source and dest have the expected sizes and +/// then returns true. +static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Make sure the size of the source and dest line up. + assert( + (DstSize == SrcSize || + // Copies are a mean to setup initial types, the number of + // bits may not exactly match. + (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || + // Copies are a mean to copy bits around, as long as we are + // on the same register class, that's fine. Otherwise, that + // means we need some SUBREG_TO_REG or AND & co. + (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && + "Copy with different width?!"); + + // Check the size of the destination. + assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && + "GPRs cannot get more than 64-bit width values"); + + return true; +} +#endif + +/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg +/// to \p *To. +/// +/// E.g "To = COPY SrcReg:SubReg" +static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, Register SrcReg, + const TargetRegisterClass *To, unsigned SubReg) { + assert(SrcReg.isValid() && "Expected a valid source register?"); + assert(To && "Destination register class cannot be null"); + assert(SubReg && "Expected a valid subregister"); + + MachineIRBuilder MIB(I); + auto SubRegCopy = + MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(SubRegCopy.getReg(0)); + + // It's possible that the destination register won't be constrained. Make + // sure that happens. + if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) + RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); + + return true; +} + +/// Helper function to get the source and destination register classes for a +/// copy. Returns a std::pair containing the source register class for the +/// copy, and the destination register class for the copy. If a register class +/// cannot be determined, then it will be nullptr. +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Special casing for cross-bank copies of s1s. We can technically represent + // a 1-bit value with any size of register. The minimum size for a GPR is 32 + // bits. So, we need to put the FPR on 32 bits as well. + // + // FIXME: I'm not sure if this case holds true outside of copies. If it does, + // then we can pull it into the helpers that get the appropriate class for a + // register bank. Or make a new helper that carries along some constraint + // information. 
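The size assertion in isValidCopy above tolerates a width mismatch in two situations: the source is a physical register at least as wide as the destination, or the destination is wider than the source but both widths round up to the same number of 32-bit words. That rounding check in isolation:

// Register sizes in bits: an s32 destination fed from an s1 source rounds to
// the same single 32-bit word, while s64 fed from s32 does not.
constexpr bool sameWordCount(unsigned DstSize, unsigned SrcSize) {
  return (DstSize + 31) / 32 == (SrcSize + 31) / 32;
}
static_assert(sameWordCount(32, 1) && !sameWordCount(64, 32),
              "one word vs. one word, but two words vs. one word");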
+ if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) + SrcSize = DstSize = 32; + + return {getMinClassForRegBank(SrcRegBank, SrcSize, true), + getMinClassForRegBank(DstRegBank, DstSize, true)}; +} + +static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + + // Find the correct register classes for the source and destination registers. + const TargetRegisterClass *SrcRC; + const TargetRegisterClass *DstRC; + std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); + + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Unexpected dest size " + << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); + return false; + } + + // A couple helpers below, for making sure that the copy we produce is valid. + + // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want + // to verify that the src and dst are the same size, since that's handled by + // the SUBREG_TO_REG. + bool KnownValid = false; + + // Returns true, or asserts if something we don't expect happens. Instead of + // returning true, we return isValidCopy() to ensure that we verify the + // result. + auto CheckCopy = [&]() { + // If we have a bitcast or something, we can't have physical registers. + assert((I.isCopy() || + (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && + !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && + "No phys reg on generic operator!"); + bool ValidCopy = true; +#ifndef NDEBUG + ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); + assert(ValidCopy && "Invalid copy."); +#endif + return ValidCopy; + }; + + // Is this a copy? If so, then we may need to insert a subregister copy. + if (I.isCopy()) { + // Yes. Check if there's anything to fix up. + if (!SrcRC) { + LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); + return false; + } + + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + unsigned SubReg; + + // If the source bank doesn't support a subregister copy small enough, + // then we first need to copy to the destination bank. + if (getMinSizeForRegBank(SrcRegBank) > DstSize) { + const TargetRegisterClass *DstTempRC = + getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); + getSubRegForClass(DstRC, TRI, SubReg); + + MachineIRBuilder MIB(I); + auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); + copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); + } else if (SrcSize > DstSize) { + // If the source register is bigger than the destination we need to + // perform a subregister copy. + const TargetRegisterClass *SubRegRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SubRegRC, TRI, SubReg); + copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); + } else if (DstSize > SrcSize) { + // If the destination register is bigger than the source we need to do + // a promotion using SUBREG_TO_REG. 
+ const TargetRegisterClass *PromotionRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SrcRC, TRI, SubReg); + + Register PromoteReg = MRI.createVirtualRegister(PromotionRC); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG), PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + + // Promise that the copy is implicitly validated by the SUBREG_TO_REG. + KnownValid = true; + } + + // If the destination is a physical register, then there's nothing to + // change, so we're done. + if (Register::isPhysicalRegister(DstReg)) + return CheckCopy(); + } + + // No need to constrain SrcReg. It will get constrained when we hit another + // of its use or its defs. Copies do not have constraints. + if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) + << " operand\n"); + return false; + } + I.setDesc(TII.get(AArch64::COPY)); + return CheckCopy(); +} + +static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { + if (!DstTy.isScalar() || !SrcTy.isScalar()) + return GenericOpc; + + const unsigned DstSize = DstTy.getSizeInBits(); + const unsigned SrcSize = SrcTy.getSizeInBits(); + + switch (DstSize) { + case 32: + switch (SrcSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUWSri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUWSri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUWSr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUWSr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUXSri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUXSri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUWDr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUWDr; + default: + return GenericOpc; + } + default: + return GenericOpc; + } + case 64: + switch (SrcSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUWDri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUWDri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUXSr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUXSr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUXDri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUXDri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUXDr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUXDr; + default: + return GenericOpc; + } + default: + return GenericOpc; + } + default: + return GenericOpc; + }; + return GenericOpc; +} + +static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::GPRRegBankID); + LLT Ty = MRI.getType(I.getOperand(0).getReg()); + if (Ty == LLT::scalar(32)) + return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr; + else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) + return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr; + return 0; +} + +/// Helper function to select the opcode for a G_FCMP. 
+static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) { + // If this is a compare against +0.0, then we don't have to explicitly + // materialize a constant. + const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI); + bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); + unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + if (OpSize != 32 && OpSize != 64) + return 0; + unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, + {AArch64::FCMPSri, AArch64::FCMPDri}}; + return CmpOpcTbl[ShouldUseImm][OpSize == 64]; +} + +/// Returns true if \p P is an unsigned integer comparison predicate. +static bool isUnsignedICMPPred(const CmpInst::Predicate P) { + switch (P) { + default: + return false; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return true; + } +} + +static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { + switch (P) { + default: + llvm_unreachable("Unknown condition code!"); + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_EQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + return AArch64CC::GE; + case CmpInst::ICMP_SLT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + return AArch64CC::LE; + case CmpInst::ICMP_UGT: + return AArch64CC::HI; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + case CmpInst::ICMP_ULE: + return AArch64CC::LS; + } +} + +static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (P) { + default: + llvm_unreachable("Unknown FP condition!"); + case CmpInst::FCMP_OEQ: + CondCode = AArch64CC::EQ; + break; + case CmpInst::FCMP_OGT: + CondCode = AArch64CC::GT; + break; + case CmpInst::FCMP_OGE: + CondCode = AArch64CC::GE; + break; + case CmpInst::FCMP_OLT: + CondCode = AArch64CC::MI; + break; + case CmpInst::FCMP_OLE: + CondCode = AArch64CC::LS; + break; + case CmpInst::FCMP_ONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case CmpInst::FCMP_ORD: + CondCode = AArch64CC::VC; + break; + case CmpInst::FCMP_UNO: + CondCode = AArch64CC::VS; + break; + case CmpInst::FCMP_UEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case CmpInst::FCMP_UGT: + CondCode = AArch64CC::HI; + break; + case CmpInst::FCMP_UGE: + CondCode = AArch64CC::PL; + break; + case CmpInst::FCMP_ULT: + CondCode = AArch64CC::LT; + break; + case CmpInst::FCMP_ULE: + CondCode = AArch64CC::LE; + break; + case CmpInst::FCMP_UNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// Return a register which can be used as a bit to test in a TB(N)Z. +static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, + MachineRegisterInfo &MRI) { + assert(Reg.isValid() && "Expected valid register!"); + while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { + unsigned Opc = MI->getOpcode(); + + if (!MI->getOperand(0).isReg() || + !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + break; + + // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. + // + // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number + // on the truncated x is the same as the bit number on x. 
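selectFCMPOpc above folds a compare against +0.0 into the immediate form and otherwise uses the register-register form, picking the final opcode by indexing a 2x2 table with two booleans. The same indexing trick standalone, with opcode names reduced to strings for illustration:

#include <cstdint>

// Row: register-register vs. compare-against-+0.0; column: 32-bit vs. 64-bit.
const char *fcmpOpcodeFor(bool UseZeroImm, unsigned OpSizeInBits) {
  if (OpSizeInBits != 32 && OpSizeInBits != 64)
    return nullptr;                               // unsupported width, as above
  static const char *Tbl[2][2] = {{"FCMPSrr", "FCMPDrr"},
                                  {"FCMPSri", "FCMPDri"}};
  return Tbl[UseZeroImm][OpSizeInBits == 64];
}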
+ if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || + Opc == TargetOpcode::G_TRUNC) { + Register NextReg = MI->getOperand(1).getReg(); + // Did we find something worth folding? + if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) + break; + + // NextReg is worth folding. Keep looking. + Reg = NextReg; + continue; + } + + // Attempt to find a suitable operation with a constant on one side. + Optional<uint64_t> C; + Register TestReg; + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + case TargetOpcode::G_XOR: { + TestReg = MI->getOperand(1).getReg(); + Register ConstantReg = MI->getOperand(2).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!VRegAndVal) { + // AND commutes, check the other side for a constant. + // FIXME: Can we canonicalize the constant so that it's always on the + // same side at some point earlier? + std::swap(ConstantReg, TestReg); + VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + } + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_SHL: { + TestReg = MI->getOperand(1).getReg(); + auto VRegAndVal = + getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + } + + // Didn't find a constant or viable register. Bail out of the loop. + if (!C || !TestReg.isValid()) + break; + + // We found a suitable instruction with a constant. Check to see if we can + // walk through the instruction. + Register NextReg; + unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. + if ((*C >> Bit) & 1) + NextReg = TestReg; + break; + case TargetOpcode::G_SHL: + // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in + // the type of the register. + if (*C <= Bit && (Bit - *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit - *C; + } + break; + case TargetOpcode::G_ASHR: + // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits + // in x + NextReg = TestReg; + Bit = Bit + *C; + if (Bit >= TestRegSize) + Bit = TestRegSize - 1; + break; + case TargetOpcode::G_LSHR: + // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x + if ((Bit + *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit + *C; + } + break; + case TargetOpcode::G_XOR: + // We can walk through a G_XOR by inverting whether we use tbz/tbnz when + // appropriate. + // + // e.g. If x' = xor x, c, and the b-th bit is set in c then + // + // tbz x', b -> tbnz x, b + // + // Because x' only has the b-th bit set if x does not. + if ((*C >> Bit) & 1) + Invert = !Invert; + NextReg = TestReg; + break; + } + + // Check if we found anything worth folding. + if (!NextReg.isValid()) + return Reg; + Reg = NextReg; + } + + return Reg; +} + +MachineInstr *AArch64InstructionSelector::emitTestBit( + Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const { + assert(TestReg.isValid()); + assert(ProduceNonFlagSettingCondBr && + "Cannot emit TB(N)Z with speculation tracking!"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Attempt to optimize the test bit by walking over instructions. 
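The shift cases in getTestBitReg above never recompute the tested value; they only renumber which bit of the shift's input gets tested, or give up when the bit would fall outside the register. That renumbering, isolated into plain C++ (the enum exists only for this sketch):

#include <cstdint>
#include <optional>

enum class ShiftKind { Shl, Lshr, Ashr };

// Given "test bit B of (X shift C)" on a Width-bit register, return the bit of
// X to test instead, or std::nullopt when the fold is not valid.
std::optional<uint64_t> foldShiftIntoTestBit(ShiftKind K, uint64_t C,
                                             uint64_t B, uint64_t Width) {
  switch (K) {
  case ShiftKind::Shl:          // (tbz (shl x, c), b) -> (tbz x, b - c)
    if (C <= B && B - C < Width)
      return B - C;
    return std::nullopt;
  case ShiftKind::Lshr:         // (tbz (lshr x, c), b) -> (tbz x, b + c)
    if (B + C < Width)
      return B + C;
    return std::nullopt;
  case ShiftKind::Ashr:         // the sign bit replicates past the top bit
    return B + C < Width ? B + C : Width - 1;
  }
  return std::nullopt;
}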
+ TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); + LLT Ty = MRI.getType(TestReg); + unsigned Size = Ty.getSizeInBits(); + assert(!Ty.isVector() && "Expected a scalar!"); + assert(Bit < 64 && "Bit is too large!"); + + // When the test register is a 64-bit register, we have to narrow to make + // TBNZW work. + bool UseWReg = Bit < 32; + unsigned NecessarySize = UseWReg ? 32 : 64; + if (Size < NecessarySize) + TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB); + else if (Size > NecessarySize) + TestReg = narrowExtendRegIfNeeded(TestReg, MIB); + + static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, + {AArch64::TBZW, AArch64::TBNZW}}; + unsigned Opc = OpcTable[UseWReg][IsNegative]; + auto TestBitMI = + MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); + constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); + return &*TestBitMI; +} + +bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( + MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { + // Given something like this: + // + // %x = ...Something... + // %one = G_CONSTANT i64 1 + // %zero = G_CONSTANT i64 0 + // %and = G_AND %x, %one + // %cmp = G_ICMP intpred(ne), %and, %zero + // %cmp_trunc = G_TRUNC %cmp + // G_BRCOND %cmp_trunc, %bb.3 + // + // We want to try and fold the AND into the G_BRCOND and produce either a + // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). + // + // In this case, we'd get + // + // TBNZ %x %bb.3 + // + if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND) + return false; + + // Need to be comparing against 0 to fold. + if (CmpConstant != 0) + return false; + + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Only support EQ and NE. If we have LT, then it *is* possible to fold, but + // we don't want to do this. When we have an AND and LT, we need a TST/ANDS, + // so folding would be redundant. + if (Pred != CmpInst::Predicate::ICMP_EQ && + Pred != CmpInst::Predicate::ICMP_NE) + return false; + + // Check if the AND has a constant on its RHS which we can use as a mask. + // If it's a power of 2, then it's the same as checking a specific bit. + // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) + auto MaybeBit = + getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI); + if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value)) + return false; + + uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value)); + Register TestReg = AndInst->getOperand(1).getReg(); + bool Invert = Pred == CmpInst::Predicate::ICMP_NE; + + // Emit a TB(N)Z. 
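tryOptAndIntoCompareBranch above only rewrites a compare of (x & mask) against zero into a TB(N)Z when the mask is a power of two; the tested bit is then simply log2 of the mask, and eq/ne decides between TBZ and TBNZ. The mask check and bit computation standalone (plain C++, no LLVM MathExtras):

#include <cstdint>
#include <optional>

std::optional<unsigned> maskToTestBit(uint64_t Mask) {
  if (Mask == 0 || (Mask & (Mask - 1)) != 0)
    return std::nullopt;        // zero or more than one bit set
  unsigned Bit = 0;
  while (!(Mask & 1)) {
    Mask >>= 1;
    ++Bit;
  }
  return Bit;                   // e.g. a mask of 8 tests bit 3, like Log2_64(8)
}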
+ emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); + return true; +} + +bool AArch64InstructionSelector::selectCompareBranch( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + + const Register CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) + CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg()); + if (CCMI->getOpcode() != TargetOpcode::G_ICMP) + return false; + + Register LHS = CCMI->getOperand(2).getReg(); + Register RHS = CCMI->getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + MachineIRBuilder MIB(I); + CmpInst::Predicate Pred = + (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); + MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI); + + // When we can emit a TB(N)Z, prefer that. + // + // Handle non-commutative condition codes first. + // Note that we don't want to do this when we have a G_AND because it can + // become a tst. The tst will make the test bit in the TB(N)Z redundant. + if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) { + int64_t C = VRegAndVal->Value; + + // When we have a greater-than comparison, we can just test if the msb is + // zero. + if (C == -1 && Pred == CmpInst::ICMP_SGT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + + // When we have a less than comparison, we can just test if the msb is not + // zero. + if (C == 0 && Pred == CmpInst::ICMP_SLT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + } + + if (!VRegAndVal) { + std::swap(RHS, LHS); + VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + LHSMI = getDefIgnoringCopies(LHS, MRI); + } + + if (!VRegAndVal || VRegAndVal->Value != 0) { + // If we can't select a CBZ then emit a cmp + Bcc. + MachineInstr *Cmp; + std::tie(Cmp, Pred) = emitIntegerCompare( + CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB); + if (!Cmp) + return false; + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred); + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); + I.eraseFromParent(); + return true; + } + + // Try to emit a TB(N)Z for an eq or ne condition. + if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB, + MIB)) { + I.eraseFromParent(); + return true; + } + + const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); + if (RB.getID() != AArch64::GPRRegBankID) + return false; + if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) + return false; + + const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); + unsigned CBOpc = 0; + if (CmpWidth <= 32) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); + else if (CmpWidth == 64) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX); + else + return false; + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) + .addUse(LHS) + .addMBB(DestMBB) + .constrainAllUses(TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + +/// Returns the element immediate value of a vector shift operand if found. +/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 
+static Optional<int64_t> getVectorShiftImm(Register Reg, + MachineRegisterInfo &MRI) { + assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); + MachineInstr *OpMI = MRI.getVRegDef(Reg); + assert(OpMI && "Expected to find a vreg def for vector shift operand"); + if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR) + return None; + + // Check all operands are identical immediates. + int64_t ImmVal = 0; + for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) { + auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI); + if (!VRegAndVal) + return None; + + if (Idx == 1) + ImmVal = VRegAndVal->Value; + if (ImmVal != VRegAndVal->Value) + return None; + } + + return ImmVal; +} + +/// Matches and returns the shift immediate value for a SHL instruction given +/// a shift operand. +static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { + Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); + if (!ShiftImm) + return None; + // Check the immediate is in range for a SHL. + int64_t Imm = *ShiftImm; + if (Imm < 0) + return None; + switch (SrcTy.getElementType().getSizeInBits()) { + default: + LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); + return None; + case 8: + if (Imm > 7) + return None; + break; + case 16: + if (Imm > 15) + return None; + break; + case 32: + if (Imm > 31) + return None; + break; + case 64: + if (Imm > 63) + return None; + break; + } + return Imm; +} + +bool AArch64InstructionSelector::selectVectorSHL( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_SHL); + Register DstReg = I.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + + if (!Ty.isVector()) + return false; + + // Check if we have a vector of constants on RHS that we can select as the + // immediate form. + Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); + + unsigned Opc = 0; + if (Ty == LLT::vector(2, 64)) { + Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; + } else if (Ty == LLT::vector(4, 32)) { + Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; + } else if (Ty == LLT::vector(2, 32)) { + Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; + } else { + LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); + return false; + } + + MachineIRBuilder MIB(I); + auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); + if (ImmVal) + Shl.addImm(*ImmVal); + else + Shl.addUse(Src2Reg); + constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVectorASHR( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_ASHR); + Register DstReg = I.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + + if (!Ty.isVector()) + return false; + + // There is not a shift right register instruction, but the shift left + // register instruction takes a signed value, where negative numbers specify a + // right shift. 
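getVectorShiftImm and getVectorSHLImm above only accept a shift amount that is a G_BUILD_VECTOR splat of one constant, and that constant must fit the element width; the per-size switch is just an upper bound of EltBits - 1. Roughly the same check over lanes that are already known constants (plain C++ sketch):

#include <cstdint>
#include <optional>
#include <vector>

std::optional<int64_t> vectorShlImm(const std::vector<int64_t> &Lanes,
                                    unsigned EltBits) {
  if (Lanes.empty())
    return std::nullopt;
  for (int64_t V : Lanes)
    if (V != Lanes.front())
      return std::nullopt;      // not a splat of a single constant
  int64_t Imm = Lanes.front();
  if (Imm < 0 || Imm >= static_cast<int64_t>(EltBits))
    return std::nullopt;        // out of range for SHL #imm (0 .. EltBits - 1)
  return Imm;
}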
+ + unsigned Opc = 0; + unsigned NegOpc = 0; + const TargetRegisterClass *RC = nullptr; + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::SSHLv2i64; + NegOpc = AArch64::NEGv2i64; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(4, 32)) { + Opc = AArch64::SSHLv4i32; + NegOpc = AArch64::NEGv4i32; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(2, 32)) { + Opc = AArch64::SSHLv2i32; + NegOpc = AArch64::NEGv2i32; + RC = &AArch64::FPR64RegClass; + } else { + LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); + return false; + } + + MachineIRBuilder MIB(I); + auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); + constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); + auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); + constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVaStartAAPCS( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + return false; +} + +bool AArch64InstructionSelector::selectVaStartDarwin( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + Register ListReg = I.getOperand(0).getReg(); + + Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + + auto MIB = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(ArgsAddrReg) + .addFrameIndex(FuncInfo->getVarArgsStackIndex()) + .addImm(0) + .addImm(0); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) + .addUse(ArgsAddrReg) + .addUse(ListReg) + .addImm(0) + .addMemOperand(*I.memoperands_begin()); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +void AArch64InstructionSelector::materializeLargeCMVal( + MachineInstr &I, const Value *V, unsigned OpFlags) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineIRBuilder MIB(I); + + auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); + MovZ->addOperand(MF, I.getOperand(1)); + MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | + AArch64II::MO_NC); + MovZ->addOperand(MF, MachineOperand::CreateImm(0)); + constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); + + auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, + Register ForceDstReg) { + Register DstReg = ForceDstReg + ? 
ForceDstReg + : MRI.createVirtualRegister(&AArch64::GPR64RegClass); + auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); + if (auto *GV = dyn_cast<GlobalValue>(V)) { + MovI->addOperand(MF, MachineOperand::CreateGA( + GV, MovZ->getOperand(1).getOffset(), Flags)); + } else { + MovI->addOperand( + MF, MachineOperand::CreateBA(cast<BlockAddress>(V), + MovZ->getOperand(1).getOffset(), Flags)); + } + MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); + constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); + return DstReg; + }; + Register DstReg = BuildMovK(MovZ.getReg(0), + AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); + DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); + BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); + return; +} + +bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + // These shifts are legalized to have 64 bit shift amounts because we want + // to take advantage of the existing imported selection patterns that assume + // the immediates are s64s. However, if the shifted type is 32 bits and for + // some reason we receive input GMIR that has an s64 shift amount that's not + // a G_CONSTANT, insert a truncate so that we can still select the s32 + // register-register variant. + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); + const LLT ShiftTy = MRI.getType(ShiftReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.isVector()) + return false; + assert(!ShiftTy.isVector() && "unexpected vector shift ty"); + if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) + return false; + auto *AmtMI = MRI.getVRegDef(ShiftReg); + assert(AmtMI && "could not find a vreg definition for shift amount"); + if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { + // Insert a subregister copy to implement a 64->32 trunc + MachineIRBuilder MIB(I); + auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) + .addReg(ShiftReg, 0, AArch64::sub_32); + MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(2).setReg(Trunc.getReg(0)); + } + return true; + } + case TargetOpcode::G_STORE: + return contractCrossBankCopyIntoStore(I, MRI); + case TargetOpcode::G_PTR_ADD: + return convertPtrAddToAdd(I, MRI); + case TargetOpcode::G_LOAD: { + // For scalar loads of pointers, we try to convert the dest type from p0 + // to s64 so that our imported patterns can match. Like with the G_PTR_ADD + // conversion, this should be ok because all users should have been + // selected already, so the type doesn't matter for them. + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + if (!DstTy.isPointer()) + return false; + MRI.setType(DstReg, LLT::scalar(64)); + return true; + } + default: + return false; + } +} + +/// This lowering tries to look for G_PTR_ADD instructions and then converts +/// them to a standard G_ADD with a COPY on the source. +/// +/// The motivation behind this is to expose the add semantics to the imported +/// tablegen patterns. We shouldn't need to check for uses being loads/stores, +/// because the selector works bottom up, uses before defs. 
By the time we +/// end up trying to select a G_PTR_ADD, we should have already attempted to +/// fold this into addressing modes and were therefore unsuccessful. +bool AArch64InstructionSelector::convertPtrAddToAdd( + MachineInstr &I, MachineRegisterInfo &MRI) { + assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); + Register DstReg = I.getOperand(0).getReg(); + Register AddOp1Reg = I.getOperand(1).getReg(); + const LLT PtrTy = MRI.getType(DstReg); + if (PtrTy.getAddressSpace() != 0) + return false; + + MachineIRBuilder MIB(I); + const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64); + auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); + // Set regbanks on the registers. + if (PtrTy.isVector()) + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); + else + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + + // Now turn the %dst(p0) = G_PTR_ADD %base, off into: + // %dst(intty) = G_ADD %intbase, off + I.setDesc(TII.get(TargetOpcode::G_ADD)); + MRI.setType(DstReg, CastPtrTy); + I.getOperand(1).setReg(PtrToInt.getReg(0)); + if (!select(*PtrToInt)) { + LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); + return false; + } + return true; +} + +bool AArch64InstructionSelector::earlySelectSHL( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // We try to match the immediate variant of LSL, which is actually an alias + // for a special case of UBFM. Otherwise, we fall back to the imported + // selector which will match the register variant. + assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); + const auto &MO = I.getOperand(2); + auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI); + if (!VRegAndVal) + return false; + + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + if (DstTy.isVector()) + return false; + bool Is64Bit = DstTy.getSizeInBits() == 64; + auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); + auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); + MachineIRBuilder MIB(I); + + if (!Imm1Fn || !Imm2Fn) + return false; + + auto NewI = + MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, + {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); + + for (auto &RenderFn : *Imm1Fn) + RenderFn(NewI); + for (auto &RenderFn : *Imm2Fn) + RenderFn(NewI); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( + MachineInstr &I, MachineRegisterInfo &MRI) { + assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); + // If we're storing a scalar, it doesn't matter what register bank that + // scalar is on. All that matters is the size. + // + // So, if we see something like this (with a 32-bit scalar as an example): + // + // %x:gpr(s32) = ... something ... + // %y:fpr(s32) = COPY %x:gpr(s32) + // G_STORE %y:fpr(s32) + // + // We can fix this up into something like this: + // + // G_STORE %x:gpr(s32) + // + // And then continue the selection process normally. + Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); + if (!DefDstReg.isValid()) + return false; + LLT DefDstTy = MRI.getType(DefDstReg); + Register StoreSrcReg = I.getOperand(0).getReg(); + LLT StoreSrcTy = MRI.getType(StoreSrcReg); + + // If we get something strange like a physical register, then we shouldn't + // go any further. 
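earlySelectSHL above leans on LSL-by-immediate being an alias of UBFM, with the two rendered immediates derived from the shift amount. My reading of that alias, as a sketch to check against the ARM ARM rather than take as authoritative:

#include <cstdint>
#include <utility>

// LSL Rd, Rn, #Shift  ==  UBFM Rd, Rn, #immr, #imms  with:
//   immr = (RegSize - Shift) % RegSize   (rotate amount)
//   imms = RegSize - 1 - Shift           (top bit of the copied field)
// RegSize is 32 or 64 and Shift must be in [0, RegSize - 1].
std::pair<uint64_t, uint64_t> lslToUbfmImms(uint64_t Shift, uint64_t RegSize) {
  return {(RegSize - Shift) % RegSize, RegSize - 1 - Shift};
}
// lslToUbfmImms(4, 64) -> {60, 59}, i.e. UBFM Xd, Xn, #60, #59 is LSL Xd, Xn, #4.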
+ if (!DefDstTy.isValid()) + return false; + + // Are the source and dst types the same size? + if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) + return false; + + if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == + RBI.getRegBank(DefDstReg, MRI, TRI)) + return false; + + // We have a cross-bank copy, which is entering a store. Let's fold it. + I.getOperand(0).setReg(DefDstReg); + return true; +} + +bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + return earlySelectSHL(I, MRI); + case TargetOpcode::G_CONSTANT: { + bool IsZero = false; + if (I.getOperand(1).isCImm()) + IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; + else if (I.getOperand(1).isImm()) + IsZero = I.getOperand(1).getImm() == 0; + + if (!IsZero) + return false; + + Register DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + if (Ty.getSizeInBits() == 64) { + I.getOperand(1).ChangeToRegister(AArch64::XZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); + } else if (Ty.getSizeInBits() == 32) { + I.getOperand(1).ChangeToRegister(AArch64::WZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); + } else + return false; + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + default: + return false; + } +} + +bool AArch64InstructionSelector::select(MachineInstr &I) { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const AArch64Subtarget *Subtarget = + &static_cast<const AArch64Subtarget &>(MF.getSubtarget()); + if (Subtarget->requiresStrictAlign()) { + // We don't support this feature yet. + LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); + return false; + } + + unsigned Opcode = I.getOpcode(); + // G_PHI requires same handling as PHI + if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { + // Certain non-generic instructions also need some special handling. 
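The G_CONSTANT early-selection above never emits a MOV for the value zero; it rewrites the instruction into a COPY from the zero register of the matching width and lets every other constant fall through to the imported patterns. The width/value decision it makes, reduced to plain C++ (register names as strings purely for illustration):

#include <cstdint>

// Returns the zero register to copy from, or nullptr when the early path does
// not apply and selection falls back to selectImpl.
const char *zeroRegisterFor(uint64_t Value, unsigned SizeInBits) {
  if (Value != 0)
    return nullptr;
  if (SizeInBits == 64)
    return "XZR";
  if (SizeInBits == 32)
    return "WZR";
  return nullptr;               // other widths are left to the normal path
}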
+ + if (Opcode == TargetOpcode::LOAD_STACK_GUARD) + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + + if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + + const RegClassOrRegBank &RegClassOrBank = + MRI.getRegClassOrRegBank(DefReg); + + const TargetRegisterClass *DefRC + = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + if (!DefRC) { + if (!DefTy.isValid()) { + LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); + return false; + } + const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); + DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); + if (!DefRC) { + LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); + return false; + } + } + + I.setDesc(TII.get(TargetOpcode::PHI)); + + return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); + } + + if (I.isCopy()) + return selectCopy(I, TII, MRI, TRI, RBI); + + return true; + } + + + if (I.getNumOperands() != I.getNumExplicitOperands()) { + LLVM_DEBUG( + dbgs() << "Generic instruction has unexpected implicit operands\n"); + return false; + } + + // Try to do some lowering before we start instruction selecting. These + // lowerings are purely transformations on the input G_MIR and so selection + // must continue after any modification of the instruction. + if (preISelLower(I)) { + Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. + } + + // There may be patterns where the importer can't deal with them optimally, + // but does select it to a suboptimal sequence so our custom C++ selection + // code later never has a chance to work on it. Therefore, we have an early + // selection attempt here to give priority to certain selection routines + // over the imported ones. + if (earlySelect(I)) + return true; + + if (selectImpl(I, *CoverageInfo)) + return true; + + LLT Ty = + I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{}; + + MachineIRBuilder MIB(I); + + switch (Opcode) { + case TargetOpcode::G_BRCOND: { + if (Ty.getSizeInBits() > 32) { + // We shouldn't need this on AArch64, but it would be implemented as an + // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the + // bit being tested is < 32. + LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty + << ", expected at most 32-bits"); + return false; + } + + const Register CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z + // instructions will not be produced, as they are conditional branch + // instructions that do not set flags. 
+ if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI)) + return true; + + if (ProduceNonFlagSettingCondBr) { + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) + .addUse(CondReg) + .addImm(/*bit offset=*/0) + .addMBB(DestMBB); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + } else { + auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) + .addDef(AArch64::WZR) + .addUse(CondReg) + .addImm(1); + constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI); + auto Bcc = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc)) + .addImm(AArch64CC::EQ) + .addMBB(DestMBB); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI); + } + } + + case TargetOpcode::G_BRINDIRECT: { + I.setDesc(TII.get(AArch64::BR)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_BRJT: + return selectBrJT(I, MRI); + + case AArch64::G_ADD_LOW: { + // This op may have been separated from it's ADRP companion by the localizer + // or some other code motion pass. Given that many CPUs will try to + // macro fuse these operations anyway, select this into a MOVaddr pseudo + // which will later be expanded into an ADRP+ADD pair after scheduling. + MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); + if (BaseMI->getOpcode() != AArch64::ADRP) { + I.setDesc(TII.get(AArch64::ADDXri)); + I.addOperand(MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + assert(TM.getCodeModel() == CodeModel::Small && + "Expected small code model"); + MachineIRBuilder MIB(I); + auto Op1 = BaseMI->getOperand(1); + auto Op2 = I.getOperand(2); + auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) + .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), + Op1.getTargetFlags()) + .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), + Op2.getTargetFlags()); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); + } + + case TargetOpcode::G_BSWAP: { + // Handle vector types for G_BSWAP directly. + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + + // We should only get vector types here; everything else is handled by the + // importer right now. + if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); + return false; + } + + // Only handle 4 and 2 element vectors for now. + // TODO: 16-bit elements. + unsigned NumElts = DstTy.getNumElements(); + if (NumElts != 4 && NumElts != 2) { + LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); + return false; + } + + // Choose the correct opcode for the supported types. Right now, that's + // v2s32, v4s32, and v2s64. + unsigned Opc = 0; + unsigned EltSize = DstTy.getElementType().getSizeInBits(); + if (EltSize == 32) + Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8 + : AArch64::REV32v16i8; + else if (EltSize == 64) + Opc = AArch64::REV64v16i8; + + // We should always get something by the time we get here... 
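The vector G_BSWAP handling above accepts only v2s32, v4s32 and v2s64 for now and maps each to a byte-reverse opcode by element size and count. That mapping written out standalone, with opcode names as strings for illustration:

// Returns the REV opcode used for a vector G_BSWAP, or nullptr for shapes the
// selector currently rejects (e.g. 16-bit elements).
const char *bswapOpcodeFor(unsigned NumElts, unsigned EltSizeInBits) {
  if (EltSizeInBits == 32 && NumElts == 2)
    return "REV32v8i8";         // v2s32 lives in a 64-bit vector register
  if (EltSizeInBits == 32 && NumElts == 4)
    return "REV32v16i8";        // v4s32 lives in a 128-bit vector register
  if (EltSizeInBits == 64 && NumElts == 2)
    return "REV64v16i8";        // v2s64
  return nullptr;
}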
+ assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); + + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_CONSTANT: { + const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; + + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + const LLT p0 = LLT::pointer(0, 64); + + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + const unsigned DefSize = DefTy.getSizeInBits(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + // FIXME: Redundant check, but even less readable when factored out. + if (isFP) { + if (Ty != s32 && Ty != s64) { + LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty + << " constant, expected: " << s32 << " or " << s64 + << '\n'); + return false; + } + + if (RB.getID() != AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty + << " constant on bank: " << RB + << ", expected: FPR\n"); + return false; + } + + // The case when we have 0.0 is covered by tablegen. Reject it here so we + // can be sure tablegen works correctly and isn't rescued by this code. + if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0)) + return false; + } else { + // s32 and s64 are covered by tablegen. + if (Ty != p0 && Ty != s8 && Ty != s16) { + LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty + << " constant, expected: " << s32 << ", " << s64 + << ", or " << p0 << '\n'); + return false; + } + + if (RB.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty + << " constant on bank: " << RB + << ", expected: GPR\n"); + return false; + } + } + + // We allow G_CONSTANT of types < 32b. + const unsigned MovOpc = + DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; + + if (isFP) { + // Either emit a FMOV, or emit a copy to emit a normal mov. + const TargetRegisterClass &GPRRC = + DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; + const TargetRegisterClass &FPRRC = + DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass; + + // Can we use a FMOV instruction to represent the immediate? + if (emitFMovForFConstant(I, MRI)) + return true; + + // For 64b values, emit a constant pool load instead. + if (DefSize == 64) { + auto *FPImm = I.getOperand(1).getFPImm(); + MachineIRBuilder MIB(I); + auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); + if (!LoadMI) { + LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); + return false; + } + MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); + } + + // Nope. Emit a copy and use a normal mov instead. + const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC); + MachineOperand &RegOp = I.getOperand(0); + RegOp.setReg(DefGPRReg); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildCopy({DefReg}, {DefGPRReg}); + + if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); + return false; + } + + MachineOperand &ImmOp = I.getOperand(1); + // FIXME: Is going through int64_t always correct? 
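When the G_FCONSTANT fallback above goes through a GPR, the float is re-encoded as its raw bit pattern so an ordinary integer MOV can materialize it before the copy over to the FPR bank. Outside of APFloat/APInt, the same re-encoding is just a bit cast (C++20 shown; memcpy does the same job before that):

#include <bit>
#include <cstdint>

uint32_t rawBits(float F)  { return std::bit_cast<uint32_t>(F); }
uint64_t rawBits(double D) { return std::bit_cast<uint64_t>(D); }
// rawBits(1.0f) == 0x3f800000, the immediate a MOVi32imm would be handed before
// the GPR -> FPR copy.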
+ ImmOp.ChangeToImmediate( + ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); + } else if (I.getOperand(1).isCImm()) { + uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); + I.getOperand(1).ChangeToImmediate(Val); + } else if (I.getOperand(1).isImm()) { + uint64_t Val = I.getOperand(1).getImm(); + I.getOperand(1).ChangeToImmediate(Val); + } + + I.setDesc(TII.get(MovOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; + } + case TargetOpcode::G_EXTRACT: { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + unsigned SrcSize = SrcTy.getSizeInBits(); + + if (SrcTy.getSizeInBits() > 64) { + // This should be an extract of an s128, which is like a vector extract. + if (SrcTy.getSizeInBits() != 128) + return false; + // Only support extracting 64 bits from an s128 at the moment. + if (DstTy.getSizeInBits() != 64) + return false; + + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + // Check we have the right regbank always. + assert(SrcRB.getID() == AArch64::FPRRegBankID && + DstRB.getID() == AArch64::FPRRegBankID && + "Wrong extract regbank!"); + (void)SrcRB; + + // Emit the same code as a vector extract. + // Offset must be a multiple of 64. + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 64 != 0) + return false; + unsigned LaneIdx = Offset / 64; + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } + + I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); + MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + + Ty.getSizeInBits() - 1); + + if (SrcSize < 64) { + assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && + "unexpected G_EXTRACT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(DstReg, 0, AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(0).setReg(DstReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_INSERT: { + LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned DstSize = DstTy.getSizeInBits(); + // Larger inserts are vectors, same-size ones should be something else by + // now (split up or turned into COPYs). + if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) + return false; + + I.setDesc(TII.get(DstSize == 64 ? 
AArch64::BFMXri : AArch64::BFMWri)); + unsigned LSB = I.getOperand(3).getImm(); + unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + I.getOperand(3).setImm((DstSize - LSB) % DstSize); + MachineInstrBuilder(MF, I).addImm(Width - 1); + + if (DstSize < 64) { + assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && + "unexpected G_INSERT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + BuildMI(MBB, I.getIterator(), I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG)) + .addDef(SrcReg) + .addImm(0) + .addUse(I.getOperand(2).getReg()) + .addImm(AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(2).setReg(SrcReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FRAME_INDEX: { + // allocas and G_FRAME_INDEX are only supported in addrspace(0). + if (Ty != LLT::pointer(0, 64)) { + LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty + << ", expected: " << LLT::pointer(0, 64) << '\n'); + return false; + } + I.setDesc(TII.get(AArch64::ADDXri)); + + // MOs for a #0 shifted immediate. + I.addOperand(MachineOperand::CreateImm(0)); + I.addOperand(MachineOperand::CreateImm(0)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_GLOBAL_VALUE: { + auto GV = I.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return selectTLSGlobalValue(I, MRI); + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); + if (OpFlags & AArch64II::MO_GOT) { + I.setDesc(TII.get(AArch64::LOADgot)); + I.getOperand(1).setTargetFlags(OpFlags); + } else if (TM.getCodeModel() == CodeModel::Large) { + // Materialize the global using movz/movk instructions. + materializeLargeCMVal(I, GV, OpFlags); + I.eraseFromParent(); + return true; + } else if (TM.getCodeModel() == CodeModel::Tiny) { + I.setDesc(TII.get(AArch64::ADR)); + I.getOperand(1).setTargetFlags(OpFlags); + } else { + I.setDesc(TII.get(AArch64::MOVaddr)); + I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); + MachineInstrBuilder MIB(MF, I); + MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), + OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_ZEXTLOAD: + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; + MachineIRBuilder MIB(I); + + LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); + + if (PtrTy != LLT::pointer(0, 64)) { + LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy + << ", expected: " << LLT::pointer(0, 64) << '\n'); + return false; + } + + auto &MemOp = **I.memoperands_begin(); + if (MemOp.isAtomic()) { + // For now we just support s8 acquire loads to be able to compile stack + // protector code. + if (MemOp.getOrdering() == AtomicOrdering::Acquire && + MemOp.getSize() == 1) { + I.setDesc(TII.get(AArch64::LDARB)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); + return false; + } + unsigned MemSizeInBits = MemOp.getSize() * 8; + + const Register PtrReg = I.getOperand(1).getReg(); +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); + // Sanity-check the pointer register. 
+ assert(PtrRB.getID() == AArch64::GPRRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(MRI.getType(PtrReg).isPointer() && + "Load/Store pointer operand isn't a pointer"); +#endif + + const Register ValReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); + + const unsigned NewOpc = + selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); + if (NewOpc == I.getOpcode()) + return false; + + I.setDesc(TII.get(NewOpc)); + + uint64_t Offset = 0; + auto *PtrMI = MRI.getVRegDef(PtrReg); + + // Try to fold a GEP into our unsigned immediate addressing mode. + if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { + if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { + int64_t Imm = *COff; + const unsigned Size = MemSizeInBits / 8; + const unsigned Scale = Log2_32(Size); + if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { + Register Ptr2Reg = PtrMI->getOperand(1).getReg(); + I.getOperand(1).setReg(Ptr2Reg); + PtrMI = MRI.getVRegDef(Ptr2Reg); + Offset = Imm / Size; + } + } + } + + // If we haven't folded anything into our addressing mode yet, try to fold + // a frame index into the base+offset. + if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) + I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); + + I.addOperand(MachineOperand::CreateImm(Offset)); + + // If we're storing a 0, use WZR/XZR. + if (auto CVal = getConstantVRegVal(ValReg, MRI)) { + if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { + if (I.getOpcode() == AArch64::STRWui) + I.getOperand(0).setReg(AArch64::WZR); + else if (I.getOpcode() == AArch64::STRXui) + I.getOperand(0).setReg(AArch64::XZR); + } + } + + if (IsZExtLoad) { + // The zextload from a smaller type to i32 should be handled by the importer. + if (MRI.getType(ValReg).getSizeInBits() != 64) + return false; + // If we have a ZEXTLOAD then change the load's type to be a narrower reg + //and zero_extend with SUBREG_TO_REG. + Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + Register DstReg = I.getOperand(0).getReg(); + I.getOperand(0).setReg(LdReg); + + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) + .addImm(0) + .addUse(LdReg) + .addImm(AArch64::sub_32); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, + MRI); + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_SMULH: + case TargetOpcode::G_UMULH: { + // Reject the various things we don't support yet. + if (unsupportedBinOp(I, RBI, MRI, TRI)) + return false; + + const Register DefReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + if (RB.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); + return false; + } + + if (Ty != LLT::scalar(64)) { + LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty + << ", expected: " << LLT::scalar(64) << '\n'); + return false; + } + + unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr + : AArch64::UMULHrr; + I.setDesc(TII.get(NewOpc)); + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. 
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + + case TargetOpcode::G_ASHR: + if (MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorASHR(I, MRI); + LLVM_FALLTHROUGH; + case TargetOpcode::G_SHL: + if (Opcode == TargetOpcode::G_SHL && + MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorSHL(I, MRI); + LLVM_FALLTHROUGH; + case TargetOpcode::G_OR: + case TargetOpcode::G_LSHR: { + // Reject the various things we don't support yet. + if (unsupportedBinOp(I, RBI, MRI, TRI)) + return false; + + const unsigned OpSize = Ty.getSizeInBits(); + + const Register DefReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); + if (NewOpc == I.getOpcode()) + return false; + + I.setDesc(TII.get(NewOpc)); + // FIXME: Should the type be always reset in setDesc? + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_PTR_ADD: { + MachineIRBuilder MIRBuilder(I); + emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), + MIRBuilder); + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_UADDO: { + // TODO: Support other types. + unsigned OpSize = Ty.getSizeInBits(); + if (OpSize != 32 && OpSize != 64) { + LLVM_DEBUG( + dbgs() + << "G_UADDO currently only supported for 32 and 64 b types.\n"); + return false; + } + + // TODO: Support vectors. + if (Ty.isVector()) { + LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n"); + return false; + } + + // Add and set the set condition flag. + unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr; + MachineIRBuilder MIRBuilder(I); + auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)}, + {I.getOperand(2), I.getOperand(3)}); + constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); + + // Now, put the overflow result in the register given by the first operand + // to the G_UADDO. CSINC increments the result when the predicate is false, + // so to get the increment when it's true, we need to use the inverse. In + // this case, we want to increment when carry is set. 
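+    // Rough sketch of the selected sequence (register names are illustrative):
+    //   %sum:gpr32, %carry:gpr32 = G_UADDO %a, %b
+    // becomes
+    //   %sum = ADDSWrr %a, %b, implicit-def $nzcv
+    //   %carry = CSINCWr $wzr, $wzr, LO, implicit $nzcv ; 1 iff carry (HS) set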
+ auto CsetMI = MIRBuilder + .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()}, + {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(getInvertedCondCode(AArch64CC::HS)); + constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + case TargetOpcode::G_PTRMASK: { + Register MaskReg = I.getOperand(2).getReg(); + Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI); + // TODO: Implement arbitrary cases + if (!MaskVal || !isShiftedMask_64(*MaskVal)) + return false; + + uint64_t Mask = *MaskVal; + I.setDesc(TII.get(AArch64::ANDXri)); + I.getOperand(2).ChangeToImmediate( + AArch64_AM::encodeLogicalImmediate(Mask, 64)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_PTRTOINT: + case TargetOpcode::G_TRUNC: { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + + if (DstRB.getID() != SrcRB.getID()) { + LLVM_DEBUG( + dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); + return false; + } + + if (DstRB.getID() == AArch64::GPRRegBankID) { + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(DstTy, DstRB, RBI); + if (!DstRC) + return false; + + const TargetRegisterClass *SrcRC = + getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); + if (!SrcRC) + return false; + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); + return false; + } + + if (DstRC == SrcRC) { + // Nothing to be done + } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && + SrcTy == LLT::scalar(64)) { + llvm_unreachable("TableGen can import this case"); + return false; + } else if (DstRC == &AArch64::GPR32RegClass && + SrcRC == &AArch64::GPR64RegClass) { + I.getOperand(1).setSubReg(AArch64::sub_32); + } else { + LLVM_DEBUG( + dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); + return false; + } + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } else if (DstRB.getID() == AArch64::FPRRegBankID) { + if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) { + I.setDesc(TII.get(AArch64::XTNv4i16)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; + } + + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } + + // We might have a vector G_PTRTOINT, in which case just emit a COPY. 
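+      // E.g. (sketch) %v:fpr(<2 x s64>) = G_PTRTOINT %p:fpr(<2 x p0>) needs
+      // nothing more than a register copy.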
+ if (Opcode == TargetOpcode::G_PTRTOINT) { + assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + } + + return false; + } + + case TargetOpcode::G_ANYEXT: { + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); + if (RBDst.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst + << ", expected: GPR\n"); + return false; + } + + const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); + if (RBSrc.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc + << ", expected: GPR\n"); + return false; + } + + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + + if (DstSize == 0) { + LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); + return false; + } + + if (DstSize != 64 && DstSize > 32) { + LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize + << ", expected: 32 or 64\n"); + return false; + } + // At this point G_ANYEXT is just like a plain COPY, but we need + // to explicitly form the 64-bit value if any. + if (DstSize > 32) { + Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) + .addDef(ExtSrc) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + I.getOperand(1).setReg(ExtSrc); + } + return selectCopy(I, TII, MRI, TRI, RBI); + } + + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT_INREG: + case TargetOpcode::G_SEXT: { + unsigned Opcode = I.getOpcode(); + const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; + const Register DefReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI.getType(DefReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); + + // SEXT_INREG has the same src reg size as dst, the size of the value to be + // extended is encoded in the imm. + if (Opcode == TargetOpcode::G_SEXT_INREG) + SrcSize = I.getOperand(2).getImm(); + + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. + + assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == + AArch64::GPRRegBankID && + "Unexpected ext regbank"); + + MachineIRBuilder MIB(I); + MachineInstr *ExtI; + + // First check if we're extending the result of a load which has a dest type + // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest + // GPR register on AArch64 and all loads which are smaller automatically + // zero-extend the upper bits. E.g. + // %v(s8) = G_LOAD %p, :: (load 1) + // %v2(s32) = G_ZEXT %v(s8) + if (!IsSigned) { + auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + bool IsGPR = + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; + if (LoadMI && IsGPR) { + const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); + unsigned BytesLoaded = MemOp->getSize(); + if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) + return selectCopy(I, TII, MRI, TRI, RBI); + } + + // If we are zero extending from 32 bits to 64 bits, it's possible that + // the instruction implicitly does the zero extend for us. In that case, + // we can just emit a SUBREG_TO_REG. + if (IsGPR && SrcSize == 32 && DstSize == 64) { + // Unlike with the G_LOAD case, we don't want to look through copies + // here. 
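+        // Sketch of the pattern this handles (names are illustrative):
+        //   %v:gpr(s32) = G_ADD %a, %b   ; a 32-bit op already clears bits 63:32
+        //   %z:gpr(s64) = G_ZEXT %v(s32)
+        // which can be selected as just
+        //   %z = SUBREG_TO_REG 0, %v, %subreg.sub_32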
+ MachineInstr *Def = MRI.getVRegDef(SrcReg); + if (Def && isDef32(*Def)) { + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + + if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); + return false; + } + + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); + return false; + } + + I.eraseFromParent(); + return true; + } + } + } + + if (DstSize == 64) { + if (Opcode != TargetOpcode::G_SEXT_INREG) { + // FIXME: Can we avoid manually doing this? + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) + << " operand\n"); + return false; + } + SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, + {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32) + .getReg(0); + } + + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else if (DstSize <= 32) { + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else { + return false; + } + + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), + SrcTy = MRI.getType(I.getOperand(1).getReg()); + const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); + if (NewOpc == Opcode) + return false; + + I.setDesc(TII.get(NewOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + + return true; + } + + case TargetOpcode::G_FREEZE: + return selectCopy(I, TII, MRI, TRI, RBI); + + case TargetOpcode::G_INTTOPTR: + // The importer is currently unable to import pointer types since they + // didn't exist in SelectionDAG. + return selectCopy(I, TII, MRI, TRI, RBI); + + case TargetOpcode::G_BITCAST: + // Imported SelectionDAG rules can handle every bitcast except those that + // bitcast from a type to the same type. Ideally, these shouldn't occur + // but we might not run an optimizer that deletes them. The other exception + // is bitcasts involving pointer types, as SelectionDAG has no knowledge + // of them. 
+    return selectCopy(I, TII, MRI, TRI, RBI);
+
+  case TargetOpcode::G_SELECT: {
+    if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
+      LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
+                        << ", expected: " << LLT::scalar(1) << '\n');
+      return false;
+    }
+
+    const Register CondReg = I.getOperand(1).getReg();
+    const Register TReg = I.getOperand(2).getReg();
+    const Register FReg = I.getOperand(3).getReg();
+
+    if (tryOptSelect(I))
+      return true;
+
+    unsigned CSelOpc = selectSelectOpc(I, MRI, RBI);
+    MachineInstr &TstMI =
+        *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
+             .addDef(AArch64::WZR)
+             .addUse(CondReg)
+             .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+    MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
+                                .addDef(I.getOperand(0).getReg())
+                                .addUse(TReg)
+                                .addUse(FReg)
+                                .addImm(AArch64CC::NE);
+
+    constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
+    constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);
+
+    I.eraseFromParent();
+    return true;
+  }
+  case TargetOpcode::G_ICMP: {
+    if (Ty.isVector())
+      return selectVectorICmp(I, MRI);
+
+    if (Ty != LLT::scalar(32)) {
+      LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
+                        << ", expected: " << LLT::scalar(32) << '\n');
+      return false;
+    }
+
+    MachineIRBuilder MIRBuilder(I);
+    MachineInstr *Cmp;
+    CmpInst::Predicate Pred;
+    std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3),
+                                             I.getOperand(1), MIRBuilder);
+    if (!Cmp)
+      return false;
+    emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
+    I.eraseFromParent();
+    return true;
+  }
+
+  case TargetOpcode::G_FCMP: {
+    if (Ty != LLT::scalar(32)) {
+      LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty
+                        << ", expected: " << LLT::scalar(32) << '\n');
+      return false;
+    }
+
+    unsigned CmpOpc = selectFCMPOpc(I, MRI);
+    if (!CmpOpc)
+      return false;
+
+    // FIXME: regbank
+
+    AArch64CC::CondCode CC1, CC2;
+    changeFCMPPredToAArch64CC(
+        (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
+
+    // Partially build the compare. Decide if we need to add a use for the
+    // third operand based off whether or not we're comparing against 0.0.
+    auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+                     .addUse(I.getOperand(2).getReg());
+
+    // If we don't have an immediate compare, then we need to add a use of the
+    // register which wasn't used for the immediate.
+    // Note that the immediate will always be the last operand.
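+    // Roughly, the two shapes are (illustrative):
+    //   fcmp s0, #0.0   ; FCMPSri, compare against zero, single register use
+    //   fcmp s0, s1     ; FCMPSrr, needs the second register use added below
+    // and likewise FCMPDri/FCMPDrr for 64-bit operands.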
+ if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + CmpMI = CmpMI.addUse(I.getOperand(3).getReg()); + + const Register DefReg = I.getOperand(0).getReg(); + Register Def1Reg = DefReg; + if (CC2 != AArch64CC::AL) + Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + + MachineInstr &CSetMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) + .addDef(Def1Reg) + .addUse(AArch64::WZR) + .addUse(AArch64::WZR) + .addImm(getInvertedCondCode(CC1)); + + if (CC2 != AArch64CC::AL) { + Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MachineInstr &CSet2MI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) + .addDef(Def2Reg) + .addUse(AArch64::WZR) + .addUse(AArch64::WZR) + .addImm(getInvertedCondCode(CC2)); + MachineInstr &OrMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) + .addDef(DefReg) + .addUse(Def1Reg) + .addUse(Def2Reg); + constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); + } + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_VASTART: + return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) + : selectVaStartAAPCS(I, MF, MRI); + case TargetOpcode::G_INTRINSIC: + return selectIntrinsic(I, MRI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectIntrinsicWithSideEffects(I, MRI); + case TargetOpcode::G_IMPLICIT_DEF: { + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const Register DstReg = I.getOperand(0).getReg(); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(DstTy, DstRB, RBI); + RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + return true; + } + case TargetOpcode::G_BLOCK_ADDR: { + if (TM.getCodeModel() == CodeModel::Large) { + materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); + I.eraseFromParent(); + return true; + } else { + I.setDesc(TII.get(AArch64::MOVaddrBA)); + auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), + I.getOperand(0).getReg()) + .addBlockAddress(I.getOperand(1).getBlockAddress(), + /* Offset */ 0, AArch64II::MO_PAGE) + .addBlockAddress( + I.getOperand(1).getBlockAddress(), /* Offset */ 0, + AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); + } + } + case TargetOpcode::G_INTRINSIC_TRUNC: + return selectIntrinsicTrunc(I, MRI); + case TargetOpcode::G_INTRINSIC_ROUND: + return selectIntrinsicRound(I, MRI); + case TargetOpcode::G_BUILD_VECTOR: + return selectBuildVector(I, MRI); + case TargetOpcode::G_MERGE_VALUES: + return selectMergeValues(I, MRI); + case TargetOpcode::G_UNMERGE_VALUES: + return selectUnmergeValues(I, MRI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return selectShuffleVector(I, MRI); + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return selectExtractElt(I, MRI); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return selectInsertElt(I, MRI); + case TargetOpcode::G_CONCAT_VECTORS: + return selectConcatVectors(I, MRI); + case TargetOpcode::G_JUMP_TABLE: + return selectJumpTable(I, MRI); + } + + return false; +} + +bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, + MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); + Register JTAddr = 
I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + Register Index = I.getOperand(2).getReg(); + MachineIRBuilder MIB(I); + + Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, + {TargetReg, ScratchReg}, {JTAddr, Index}) + .addJumpTableIndex(JTI); + // Build the indirect branch. + MIB.buildInstr(AArch64::BR, {}, {TargetReg}); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectJumpTable( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); + assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); + + Register DstReg = I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. + MachineIRBuilder MIB(I); + auto MovMI = + MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) + .addJumpTableIndex(JTI, AArch64II::MO_PAGE) + .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectTLSGlobalValue( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (!STI.isTargetMachO()) + return false; + MachineFunction &MF = *I.getParent()->getParent(); + MF.getFrameInfo().setAdjustsStack(true); + + const GlobalValue &GV = *I.getOperand(1).getGlobal(); + MachineIRBuilder MIB(I); + + MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + + auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, + {Register(AArch64::X0)}) + .addImm(0); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) + .addDef(AArch64::X0, RegState::Implicit) + .addRegMask(TRI.getTLSCallPreservedMask()); + + MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, + MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectIntrinsicTrunc( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTZHr; + break; + case 32: + Opc = AArch64::FRINTZSr; + break; + case 64: + Opc = AArch64::FRINTZDr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTZv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTZv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTZv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. 
+ LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectIntrinsicRound( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTAHr; + break; + case 32: + Opc = AArch64::FRINTASr; + break; + case 64: + Opc = AArch64::FRINTADr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTAv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTAv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTAv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. + LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectVectorICmp( + MachineInstr &I, MachineRegisterInfo &MRI) const { + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register SrcReg = I.getOperand(2).getReg(); + Register Src2Reg = I.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); + unsigned NumElts = DstTy.getNumElements(); + + // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b + // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 + // Third index is cc opcode: + // 0 == eq + // 1 == ugt + // 2 == uge + // 3 == ult + // 4 == ule + // 5 == sgt + // 6 == sge + // 7 == slt + // 8 == sle + // ne is done by negating 'eq' result. + + // This table below assumes that for some comparisons the operands will be + // commuted. + // ult op == commute + ugt op + // ule op == commute + uge op + // slt op == commute + sgt op + // sle op == commute + sge op + unsigned PredIdx = 0; + bool SwapOperands = false; + CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_EQ: + PredIdx = 0; + break; + case CmpInst::ICMP_UGT: + PredIdx = 1; + break; + case CmpInst::ICMP_UGE: + PredIdx = 2; + break; + case CmpInst::ICMP_ULT: + PredIdx = 3; + SwapOperands = true; + break; + case CmpInst::ICMP_ULE: + PredIdx = 4; + SwapOperands = true; + break; + case CmpInst::ICMP_SGT: + PredIdx = 5; + break; + case CmpInst::ICMP_SGE: + PredIdx = 6; + break; + case CmpInst::ICMP_SLT: + PredIdx = 7; + SwapOperands = true; + break; + case CmpInst::ICMP_SLE: + PredIdx = 8; + SwapOperands = true; + break; + default: + llvm_unreachable("Unhandled icmp predicate"); + return false; + } + + // This table obviously should be tablegen'd when we have our GISel native + // tablegen selector. 
+ + static const unsigned OpcTable[4][4][9] = { + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, + AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, + AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, + {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, + AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, + AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} + }, + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, + AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, + AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, + {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, + AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, + AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, + AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, + AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, + {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, + AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, + AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, + AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, + AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + }; + unsigned EltIdx = Log2_32(SrcEltSize / 8); + unsigned NumEltsIdx = Log2_32(NumElts / 2); + unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; + if (!Opc) { + LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); + return false; + } + + const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = + getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); + if (!SrcRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return false; + } + + unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; + if (SrcTy.getSizeInBits() == 128) + NotOpc = NotOpc ? 
AArch64::NOTv16i8 : 0; + + if (SwapOperands) + std::swap(SrcReg, Src2Reg); + + MachineIRBuilder MIB(I); + auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + + // Invert if we had a 'ne' cc. + if (NotOpc) { + Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } else { + MIB.buildCopy(DstReg, Cmp.getReg(0)); + } + RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::emitScalarToVector( + unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, + MachineIRBuilder &MIRBuilder) const { + auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); + + auto BuildFn = [&](unsigned SubregIndex) { + auto Ins = + MIRBuilder + .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) + .addImm(SubregIndex); + constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); + return &*Ins; + }; + + switch (EltSize) { + case 16: + return BuildFn(AArch64::hsub); + case 32: + return BuildFn(AArch64::ssub); + case 64: + return BuildFn(AArch64::dsub); + default: + return nullptr; + } +} + +bool AArch64InstructionSelector::selectMergeValues( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + + if (I.getNumOperands() != 3) + return false; + + // Merging 2 s64s into an s128. 
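+  // E.g. (sketch) %w:fpr(s128) = G_MERGE_VALUES %lo:fpr(s64), %hi:fpr(s64) is
+  // handled as two lane inserts into an IMPLICIT_DEF 128-bit register.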
+ if (DstTy == LLT::scalar(128)) { + if (SrcTy.getSizeInBits() != 64) + return false; + MachineIRBuilder MIB(I); + Register DstReg = I.getOperand(0).getReg(); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); + MachineInstr *InsMI = + emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); + if (!InsMI) + return false; + MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), + Src2Reg, /* LaneIdx */ 1, RB, MIB); + if (!Ins2MI) + return false; + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + if (RB.getID() != AArch64::GPRRegBankID) + return false; + + if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) + return false; + + auto *DstRC = &AArch64::GPR64RegClass; + Register SubToRegDef = MRI.createVirtualRegister(DstRC); + MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(SubToRegDef) + .addImm(0) + .addUse(I.getOperand(1).getReg()) + .addImm(AArch64::sub_32); + Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); + // Need to anyext the second scalar before we can use bfm + MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(SubToRegDef2) + .addImm(0) + .addUse(I.getOperand(2).getReg()) + .addImm(AArch64::sub_32); + MachineInstr &BFM = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) + .addDef(I.getOperand(0).getReg()) + .addUse(SubToRegDef) + .addUse(SubToRegDef2) + .addImm(32) + .addImm(31); + constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); + constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, + const unsigned EltSize) { + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + switch (EltSize) { + case 16: + CopyOpc = AArch64::CPYi16; + ExtractSubReg = AArch64::hsub; + break; + case 32: + CopyOpc = AArch64::CPYi32; + ExtractSubReg = AArch64::ssub; + break; + case 64: + CopyOpc = AArch64::CPYi64; + ExtractSubReg = AArch64::dsub; + break; + default: + // Unknown size, bail out. 
+ LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); + return false; + } + return true; +} + +MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( + Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { + LLVM_DEBUG( + dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); + return nullptr; + } + + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); + return nullptr; + } + + const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); + const LLT &VecTy = MRI.getType(VecReg); + const TargetRegisterClass *VecRC = + getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); + if (!VecRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return nullptr; + } + + // The register that we're going to copy into. + Register InsertReg = VecReg; + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + // If the lane index is 0, we just use a subregister COPY. + if (LaneIdx == 0) { + auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) + .addReg(VecReg, 0, ExtractSubReg); + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return &*Copy; + } + + // Lane copies require 128-bit wide registers. If we're dealing with an + // unpacked vector, then we need to move up to that width. Insert an implicit + // def and a subregister insert to get us there. + if (VecTy.getSizeInBits() != 128) { + MachineInstr *ScalarToVector = emitScalarToVector( + VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); + if (!ScalarToVector) + return nullptr; + InsertReg = ScalarToVector->getOperand(0).getReg(); + } + + MachineInstr *LaneCopyMI = + MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); + constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); + + // Make sure that we actually constrain the initial copy. + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return LaneCopyMI; +} + +bool AArch64InstructionSelector::selectExtractElt( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && + "unexpected opcode!"); + Register DstReg = I.getOperand(0).getReg(); + const LLT NarrowTy = MRI.getType(DstReg); + const Register SrcReg = I.getOperand(1).getReg(); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && + "source register size too small!"); + assert(NarrowTy.isScalar() && "cannot extract vector into vector!"); + + // Need the lane index to determine the correct copy opcode. + MachineOperand &LaneIdxOp = I.getOperand(2); + assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); + return false; + } + + // Find the index to extract from. 
+ auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + MachineIRBuilder MIRBuilder(I); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, + LaneIdx, MIRBuilder); + if (!Extract) + return false; + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectSplitVectorUnmerge( + MachineInstr &I, MachineRegisterInfo &MRI) const { + unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(SrcReg); + + assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); + if (SrcTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); + return false; + } + + MachineIRBuilder MIB(I); + + // We implement a split vector operation by treating the sub-vectors as + // scalars and extracting them. + const RegisterBank &DstRB = + *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); + for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { + Register Dst = I.getOperand(OpIdx).getReg(); + MachineInstr *Extract = + emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); + if (!Extract) + return false; + } + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectUnmergeValues( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "unexpected opcode"); + + // TODO: Handle unmerging into GPRs and from scalars to scalars. + if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID || + RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " + "currently unsupported.\n"); + return false; + } + + // The last operand is the vector source register, and every other operand is + // a register to unpack into. + unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && + "can only unmerge from vector or s128 types!"); + assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && + "source register size too small!"); + + if (!NarrowTy.isScalar()) + return selectSplitVectorUnmerge(I, MRI); + + MachineIRBuilder MIB(I); + + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) + return false; + + // Set up for the lane copies. + MachineBasicBlock &MBB = *I.getParent(); + + // Stores the registers we'll be copying from. + SmallVector<Register, 4> InsertRegs; + + // We'll use the first register twice, so we only need NumElts-1 registers. + unsigned NumInsertRegs = NumElts - 1; + + // If our elements fit into exactly 128 bits, then we can copy from the source + // directly. Otherwise, we need to do a bit of setup with some subregister + // inserts. + if (NarrowTy.getSizeInBits() * NumElts == 128) { + InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg); + } else { + // No. We have to perform subregister inserts. 
For each insert, create an + // implicit def and a subregister insert, and save the register we create. + for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { + Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &ImpDefMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), + ImpDefReg); + + // Now, create the subregister insert from SrcReg. + Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &InsMI = + *BuildMI(MBB, I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) + .addUse(ImpDefReg) + .addUse(SrcReg) + .addImm(AArch64::dsub); + + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + + // Save the register so that we can copy from it after. + InsertRegs.push_back(InsertReg); + } + } + + // Now that we've created any necessary subregister inserts, we can + // create the copies. + // + // Perform the first copy separately as a subregister copy. + Register CopyTo = I.getOperand(0).getReg(); + auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) + .addReg(InsertRegs[0], 0, ExtractSubReg); + constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); + + // Now, perform the remaining copies as vector lane copies. + unsigned LaneIdx = 1; + for (Register InsReg : InsertRegs) { + Register CopyTo = I.getOperand(LaneIdx).getReg(); + MachineInstr &CopyInst = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) + .addUse(InsReg) + .addImm(LaneIdx); + constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); + ++LaneIdx; + } + + // Separately constrain the first copy's destination. Because of the + // limitation in constrainOperandRegClass, we can't guarantee that this will + // actually be constrained. So, do it ourselves using the second operand. 
+ const TargetRegisterClass *RC = + MRI.getRegClassOrNull(I.getOperand(1).getReg()); + if (!RC) { + LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); + return false; + } + + RBI.constrainGenericRegister(CopyTo, *RC, MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectConcatVectors( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && + "Unexpected opcode"); + Register Dst = I.getOperand(0).getReg(); + Register Op1 = I.getOperand(1).getReg(); + Register Op2 = I.getOperand(2).getReg(); + MachineIRBuilder MIRBuilder(I); + MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder); + if (!ConcatMI) + return false; + I.eraseFromParent(); + return true; +} + +unsigned +AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, + MachineFunction &MF) const { + Type *CPTy = CPVal->getType(); + Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); + + MachineConstantPool *MCP = MF.getConstantPool(); + return MCP->getConstantPoolIndex(CPVal, Alignment); +} + +MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( + const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { + unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF()); + + auto Adrp = + MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) + .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); + + MachineInstr *LoadMI = nullptr; + switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) { + case 16: + LoadMI = + &*MIRBuilder + .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) + .addConstantPoolIndex(CPIdx, 0, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + break; + case 8: + LoadMI = &*MIRBuilder + .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp}) + .addConstantPoolIndex( + CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + break; + default: + LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " + << *CPVal->getType()); + return nullptr; + } + constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); + constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); + return LoadMI; +} + +/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given +/// size and RB. 
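+/// As a concrete illustration of the mapping below, a 32-bit element coming
+/// from the FPR bank is expected to yield {AArch64::INSvi32lane,
+/// AArch64::ssub}, while the same element coming from a GPR yields
+/// {AArch64::INSvi32gpr, AArch64::ssub}.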
+static std::pair<unsigned, unsigned> +getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { + unsigned Opc, SubregIdx; + if (RB.getID() == AArch64::GPRRegBankID) { + if (EltSize == 32) { + Opc = AArch64::INSvi32gpr; + SubregIdx = AArch64::ssub; + } else if (EltSize == 64) { + Opc = AArch64::INSvi64gpr; + SubregIdx = AArch64::dsub; + } else { + llvm_unreachable("invalid elt size!"); + } + } else { + if (EltSize == 8) { + Opc = AArch64::INSvi8lane; + SubregIdx = AArch64::bsub; + } else if (EltSize == 16) { + Opc = AArch64::INSvi16lane; + SubregIdx = AArch64::hsub; + } else if (EltSize == 32) { + Opc = AArch64::INSvi32lane; + SubregIdx = AArch64::ssub; + } else if (EltSize == 64) { + Opc = AArch64::INSvi64lane; + SubregIdx = AArch64::dsub; + } else { + llvm_unreachable("invalid elt size!"); + } + } + return std::make_pair(Opc, SubregIdx); +} + +MachineInstr * +AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, + {AArch64::ADDWrr, AArch64::ADDWri}}; + bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS}); + + // If we matched a valid constant immediate, add those operands. + if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(AddMI); + } else { + AddMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); + return &*AddMI; +} + +MachineInstr * +AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri}, + {AArch64::ADDSWrr, AArch64::ADDSWri}}; + bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; + + auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); + + // If we matched a valid constant immediate, add those operands. + if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + } else { + CmpMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +MachineInstr * +AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + unsigned RegSize = MRI.getType(LHS).getSizeInBits(); + bool Is32Bit = (RegSize == 32); + static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri}, + {AArch64::ANDSWrr, AArch64::ANDSWri}}; + Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; + + // We might be able to fold in an immediate into the TST. We need to make sure + // it's a logical immediate though, since ANDS requires that. 
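+  // For example (sketch), a mask of 0xff is encodable, giving "tst w0, #0xff",
+  // whereas something like 0x12345 is not a valid logical immediate and has to
+  // use the register form ANDSWrr/ANDSXrr.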
+ auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI); + bool IsImmForm = ValAndVReg.hasValue() && + AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize); + unsigned Opc = OpcTable[Is32Bit][IsImmForm]; + auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); + + if (IsImmForm) + TstMI.addImm( + AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize)); + else + TstMI.addUse(RHS); + + constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); + return &*TstMI; +} + +std::pair<MachineInstr *, CmpInst::Predicate> +AArch64InstructionSelector::emitIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + assert(Predicate.isPredicate() && "Expected predicate?"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + + CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); + + // Fold the compare if possible. + MachineInstr *FoldCmp = + tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder); + if (FoldCmp) + return {FoldCmp, P}; + + // Can't fold into a CMN. Just emit a normal compare. + unsigned CmpOpc = 0; + Register ZReg; + + LLT CmpTy = MRI.getType(LHS.getReg()); + assert((CmpTy.isScalar() || CmpTy.isPointer()) && + "Expected scalar or pointer"); + if (CmpTy == LLT::scalar(32)) { + CmpOpc = AArch64::SUBSWrr; + ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { + CmpOpc = AArch64::SUBSXrr; + ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + } else { + return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE}; + } + + // Try to match immediate forms. + MachineInstr *ImmedCmp = + tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder); + if (ImmedCmp) + return {ImmedCmp, P}; + + // If we don't have an immediate, we may have a shift which can be folded + // into the compare. + MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder); + if (ShiftedCmp) + return {ShiftedCmp, P}; + + auto CmpMI = + MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()}); + // Make sure that we can constrain the compare that we emitted. + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return {&*CmpMI, P}; +} + +MachineInstr *AArch64InstructionSelector::emitVectorConcat( + Optional<Register> Dst, Register Op1, Register Op2, + MachineIRBuilder &MIRBuilder) const { + // We implement a vector concat by: + // 1. Use scalar_to_vector to insert the lower vector into the larger dest + // 2. Insert the upper vector into the destination's upper element + // TODO: some of this code is common with G_BUILD_VECTOR handling. + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + + const LLT Op1Ty = MRI.getType(Op1); + const LLT Op2Ty = MRI.getType(Op2); + + if (Op1Ty != Op2Ty) { + LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); + return nullptr; + } + assert(Op1Ty.isVector() && "Expected a vector for vector concat"); + + if (Op1Ty.getSizeInBits() >= 128) { + LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); + return nullptr; + } + + // At the moment we just support 64 bit vector concats. 
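+  // E.g. (sketch) concatenating two <2 x s32> operands into a <4 x s32>;
+  // 128-bit-or-wider sources were rejected above, and anything other than
+  // 64-bit sources is rejected just below.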
+ if (Op1Ty.getSizeInBits() != 64) { + LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); + return nullptr; + } + + const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); + const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); + const TargetRegisterClass *DstRC = + getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2); + + MachineInstr *WidenedOp1 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); + MachineInstr *WidenedOp2 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); + if (!WidenedOp1 || !WidenedOp2) { + LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); + return nullptr; + } + + // Now do the insert of the upper element. + unsigned InsertOpc, InsSubRegIdx; + std::tie(InsertOpc, InsSubRegIdx) = + getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); + + if (!Dst) + Dst = MRI.createVirtualRegister(DstRC); + auto InsElt = + MIRBuilder + .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) + .addImm(1) /* Lane index */ + .addUse(WidenedOp2->getOperand(0).getReg()) + .addImm(0); + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return &*InsElt; +} + +MachineInstr *AArch64InstructionSelector::emitFMovForFConstant( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && + "Expected a G_FCONSTANT!"); + MachineOperand &ImmOp = I.getOperand(1); + unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + + // Only handle 32 and 64 bit defs for now. + if (DefSize != 32 && DefSize != 64) + return nullptr; + + // Don't handle null values using FMOV. + if (ImmOp.getFPImm()->isNullValue()) + return nullptr; + + // Get the immediate representation for the FMOV. + const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF(); + int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF) + : AArch64_AM::getFP64Imm(ImmValAPF); + + // If this is -1, it means the immediate can't be represented as the requested + // floating point value. Bail. + if (Imm == -1) + return nullptr; + + // Update MI to represent the new FMOV instruction, constrain it, and return. + ImmOp.ChangeToImmediate(Imm); + unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi; + I.setDesc(TII.get(MovOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return &I; +} + +MachineInstr * +AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const { + // CSINC increments the result when the predicate is false. Invert it. + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate((CmpInst::Predicate)Pred)); + auto I = + MIRBuilder + .buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(InvCC); + constrainSelectedInstRegOperands(*I, TII, TRI, RBI); + return &*I; +} + +bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { + MachineIRBuilder MIB(I); + MachineRegisterInfo &MRI = *MIB.getMRI(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + + // We want to recognize this pattern: + // + // $z = G_FCMP pred, $x, $y + // ... + // $w = G_SELECT $z, $a, $b + // + // Where the value of $z is *only* ever used by the G_SELECT (possibly with + // some copies/truncs in between.) 
+ // + // If we see this, then we can emit something like this: + // + // fcmp $x, $y + // fcsel $w, $a, $b, pred + // + // Rather than emitting both of the rather long sequences in the standard + // G_FCMP/G_SELECT select methods. + + // First, check if the condition is defined by a compare. + MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); + while (CondDef) { + // We can only fold if all of the defs have one use. + Register CondDefReg = CondDef->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDefReg)) { + // Unless it's another select. + for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { + if (CondDef == &UI) + continue; + if (UI.getOpcode() != TargetOpcode::G_SELECT) + return false; + } + } + + // We can skip over G_TRUNC since the condition is 1-bit. + // Truncating/extending can have no impact on the value. + unsigned Opc = CondDef->getOpcode(); + if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) + break; + + // Can't see past copies from physregs. + if (Opc == TargetOpcode::COPY && + Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) + return false; + + CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); + } + + // Is the condition defined by a compare? + if (!CondDef) + return false; + + unsigned CondOpc = CondDef->getOpcode(); + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + return false; + + AArch64CC::CondCode CondCode; + if (CondOpc == TargetOpcode::G_ICMP) { + MachineInstr *Cmp; + CmpInst::Predicate Pred; + + std::tie(Cmp, Pred) = + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), + CondDef->getOperand(1), MIB); + + if (!Cmp) { + LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); + return false; + } + + // Have to collect the CondCode after emitIntegerCompare, since it can + // update the predicate. + CondCode = changeICMPPredToAArch64CC(Pred); + } else { + // Get the condition code for the select. + AArch64CC::CondCode CondCode2; + changeFCMPPredToAArch64CC( + (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode, + CondCode2); + + // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two + // instructions to emit the comparison. + // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be + // unnecessary. + if (CondCode2 != AArch64CC::AL) + return false; + + // Make sure we'll be able to select the compare. + unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI); + if (!CmpOpc) + return false; + + // Emit a new compare. + auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()}); + if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + Cmp.addUse(CondDef->getOperand(3).getReg()); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } + + // Emit the select. 
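+  // The condition code computed above feeds the CSEL/FCSEL directly, so no
+  // intermediate CSET or materialized i1 value is needed.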
+ unsigned CSelOpc = selectSelectOpc(I, MRI, RBI); + auto CSel = + MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()}, + {I.getOperand(2).getReg(), I.getOperand(3).getReg()}) + .addImm(CondCode); + constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && + "Unexpected MachineOperand"); + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + // We want to find this sort of thing: + // x = G_SUB 0, y + // G_ICMP z, x + // + // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. + // e.g: + // + // cmn z, y + + // Helper lambda to detect the subtract followed by the compare. + // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0. + auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) { + if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Need to make sure NZCV is the same at the end of the transformation. + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return false; + + // We want to match against SUBs. + if (DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Make sure that we're getting + // x = G_SUB 0, y + auto ValAndVReg = + getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return false; + + // This can safely be represented as a CMN. + return true; + }; + + // Check if the RHS or LHS of the G_ICMP is defined by a SUB + MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); + MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); + CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P); + + // Given this: + // + // x = G_SUB 0, y + // G_ICMP x, z + // + // Produce this: + // + // cmn y, z + if (IsCMN(LHSDef, CC)) + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + + // Same idea here, but with the RHS of the compare instead: + // + // Given this: + // + // x = G_SUB 0, y + // G_ICMP z, x + // + // Produce this: + // + // cmn z, y + if (IsCMN(RHSDef, CC)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + + // Given this: + // + // z = G_AND x, y + // G_ICMP z, 0 + // + // Produce this if the compare is signed: + // + // tst x, y + if (!isUnsignedICMPPred(P) && LHSDef && + LHSDef->getOpcode() == TargetOpcode::G_AND) { + // Make sure that the RHS is 0. + auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return nullptr; + + return emitTST(LHSDef->getOperand(1).getReg(), + LHSDef->getOperand(2).getReg(), MIRBuilder); + } + + return nullptr; +} + +MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P, + MachineIRBuilder &MIB) const { + // Attempt to select the immediate form of an integer compare. + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "Expected 32 bit or 64 bit compare only?"); + + // Check if this is a case we can already handle. 
+ InstructionSelector::ComplexRendererFns ImmFns; + ImmFns = selectArithImmed(RHS); + + if (!ImmFns) { + // We didn't get a rendering function, but we may still have a constant. + auto MaybeImmed = getImmedFromMO(RHS); + if (!MaybeImmed) + return nullptr; + + // We have a constant, but it doesn't fit. Try adjusting it by one and + // updating the predicate if possible. + uint64_t C = *MaybeImmed; + CmpInst::Predicate NewP; + switch (P) { + default: + return nullptr; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + // Check for + // + // x slt c => x sle c - 1 + // x sge c => x sgt c - 1 + // + // When c is not the smallest possible negative number. + if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) || + (Size == 32 && static_cast<int32_t>(C) == INT32_MIN)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT; + C -= 1; + break; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_UGE: + // Check for + // + // x ult c => x ule c - 1 + // x uge c => x ugt c - 1 + // + // When c is not zero. + if (C == 0) + return nullptr; + NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; + C -= 1; + break; + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_SGT: + // Check for + // + // x sle c => x slt c + 1 + // x sgt c => s sge c + 1 + // + // When c is not the largest possible signed integer. + if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) || + (Size == 64 && static_cast<int64_t>(C) == INT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE; + C += 1; + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_UGT: + // Check for + // + // x ule c => x ult c + 1 + // x ugt c => s uge c + 1 + // + // When c is not the largest possible unsigned integer. + if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) || + (Size == 64 && C == UINT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE; + C += 1; + break; + } + + // Check if the new constant is valid. + if (Size == 32) + C = static_cast<uint32_t>(C); + ImmFns = select12BitValueWithLeftShift(C); + if (!ImmFns) + return nullptr; + P = NewP; + } + + // At this point, we know we can select an immediate form. Go ahead and do + // that. + Register ZReg; + unsigned Opc; + if (Size == 32) { + ZReg = AArch64::WZR; + Opc = AArch64::SUBSWri; + } else { + ZReg = AArch64::XZR; + Opc = AArch64::SUBSXri; + } + + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const { + // We are looking for the following pattern: + // + // shift = G_SHL/ASHR/LHSR y, c + // ... + // cmp = G_ICMP pred, something, shift + // + // Since we will select the G_ICMP to a SUBS, we can potentially fold the + // shift into the subtract. 
+ static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs}; + static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR}; + auto ImmFns = selectShiftedRegister(RHS); + if (!ImmFns) + return nullptr; + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + bool Idx = (Size == 64); + Register ZReg = ZRegTable[Idx]; + unsigned Opc = OpcTable[Idx]; + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +bool AArch64InstructionSelector::selectShuffleVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + Register Src1Reg = I.getOperand(1).getReg(); + const LLT Src1Ty = MRI.getType(Src1Reg); + Register Src2Reg = I.getOperand(2).getReg(); + const LLT Src2Ty = MRI.getType(Src2Reg); + ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + LLVMContext &Ctx = MF.getFunction().getContext(); + + // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if + // it's originated from a <1 x T> type. Those should have been lowered into + // G_BUILD_VECTOR earlier. + if (!Src1Ty.isVector() || !Src2Ty.isVector()) { + LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); + return false; + } + + unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; + + SmallVector<Constant *, 64> CstIdxs; + for (int Val : Mask) { + // For now, any undef indexes we'll just assume to be 0. This should be + // optimized in future, e.g. to select DUP etc. + Val = Val < 0 ? 0 : Val; + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); + } + } + + MachineIRBuilder MIRBuilder(I); + + // Use a constant pool to load the index vector for TBL. + Constant *CPVal = ConstantVector::get(CstIdxs); + MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder); + if (!IndexLoad) { + LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); + return false; + } + + if (DstTy.getSizeInBits() != 128) { + assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); + // This case can be done with TBL1. + MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder); + if (!Concat) { + LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); + return false; + } + + // The constant pool load will be 64 bits, so need to convert to FPR128 reg. + IndexLoad = + emitScalarToVector(64, &AArch64::FPR128RegClass, + IndexLoad->getOperand(0).getReg(), MIRBuilder); + + auto TBL1 = MIRBuilder.buildInstr( + AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, + {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); + constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); + + auto Copy = + MIRBuilder + .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(TBL1.getReg(0), 0, AArch64::dsub); + RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); + I.eraseFromParent(); + return true; + } + + // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive + // Q registers for regalloc. 
+ auto RegSeq = MIRBuilder + .buildInstr(TargetOpcode::REG_SEQUENCE, + {&AArch64::QQRegClass}, {Src1Reg}) + .addImm(AArch64::qsub0) + .addUse(Src2Reg) + .addImm(AArch64::qsub1); + + auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, + {RegSeq, IndexLoad->getOperand(0)}); + constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); + constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::emitLaneInsert( + Optional<Register> DstReg, Register SrcReg, Register EltReg, + unsigned LaneIdx, const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const { + MachineInstr *InsElt = nullptr; + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + + // Create a register to define with the insert if one wasn't passed in. + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + + unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); + unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; + + if (RB.getID() == AArch64::FPRRegBankID) { + auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(InsSub->getOperand(0).getReg()) + .addImm(0); + } else { + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(EltReg); + } + + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return InsElt; +} + +bool AArch64InstructionSelector::selectInsertElt( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); + + // Get information on the destination. + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + unsigned VecSize = DstTy.getSizeInBits(); + + // Get information on the element we want to insert into the destination. + Register EltReg = I.getOperand(2).getReg(); + const LLT EltTy = MRI.getType(EltReg); + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + + // Find the definition of the index. Bail out if it's not defined by a + // G_CONSTANT. + Register IdxReg = I.getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + // Perform the lane insert. + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + if (VecSize < 128) { + // If the vector we're inserting into is smaller than 128 bits, widen it + // to 128 to do the insert. + MachineInstr *ScalarToVec = emitScalarToVector( + VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder); + if (!ScalarToVec) + return false; + SrcReg = ScalarToVec->getOperand(0).getReg(); + } + + // Create an insert into a new FPR128 register. + // Note that if our vector is already 128 bits, we end up emitting an extra + // register. + MachineInstr *InsMI = + emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder); + + if (VecSize < 128) { + // If we had to widen to perform the insert, then we have to demote back to + // the original size to get the result we want. 
+ Register DemoteVec = InsMI->getOperand(0).getReg(); + const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize + << "\n"); + return false; + } + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DemoteVec, 0, SubReg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // No widening needed. + InsMI->getOperand(0).setReg(DstReg); + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptConstantBuildVec( + MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!"); + if (DstTy.getSizeInBits() < 32) + return false; + // Check if we're building a constant vector, in which case we want to + // generate a constant pool load instead of a vector insert sequence. + SmallVector<Constant *, 16> Csts; + for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { + // Try to find G_CONSTANT or G_FCONSTANT + auto *OpMI = + getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); + if (OpMI) + Csts.emplace_back( + const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); + else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, + I.getOperand(Idx).getReg(), MRI))) + Csts.emplace_back( + const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); + else + return false; + } + Constant *CV = ConstantVector::get(Csts); + MachineIRBuilder MIB(I); + auto *CPLoad = emitLoadFromConstantPool(CV, MIB); + if (!CPLoad) { + LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); + return false; + } + MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + *MRI.getRegClass(CPLoad->getOperand(0).getReg()), + MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectBuildVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + // Until we port more of the optimized selections, for now just use a vector + // insert sequence. + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); + unsigned EltSize = EltTy.getSizeInBits(); + + if (tryOptConstantBuildVec(I, DstTy, MRI)) + return true; + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineInstr *ScalarToVec = + emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, + I.getOperand(1).getReg(), MIRBuilder); + if (!ScalarToVec) + return false; + + Register DstVec = ScalarToVec->getOperand(0).getReg(); + unsigned DstSize = DstTy.getSizeInBits(); + + // Keep track of the last MI we inserted. Later on, we might be able to save + // a copy using it. 
+ MachineInstr *PrevMI = nullptr; + for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { + // Note that if we don't do a subregister copy, we can end up making an + // extra register. + PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, + MIRBuilder); + DstVec = PrevMI->getOperand(0).getReg(); + } + + // If DstTy's size in bits is less than 128, then emit a subregister copy + // from DstVec to the last register we've defined. + if (DstSize < 128) { + // Force this to be FPR using the destination vector. + const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); + if (!RC) + return false; + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize + << "\n"); + return false; + } + + Register Reg = MRI.createVirtualRegister(RC); + Register DstReg = I.getOperand(0).getReg(); + + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DstVec, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(Reg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // We don't need a subregister copy. Save a copy by re-using the + // destination register on the final insert. + assert(PrevMI && "PrevMI was null?"); + PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); + constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +/// Helper function to find an intrinsic ID on an a MachineInstr. Returns the +/// ID if it exists, and 0 otherwise. +static unsigned findIntrinsicID(MachineInstr &I) { + auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) { + return Op.isIntrinsicID(); + }); + if (IntrinOp == I.operands_end()) + return 0; + return IntrinOp->getIntrinsicID(); +} + +bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // Find the intrinsic ID. + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + // Select the instruction. + switch (IntrinID) { + default: + return false; + case Intrinsic::trap: + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1); + break; + case Intrinsic::debugtrap: + if (!STI.isTargetWindows()) + return false; + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); + break; + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, + MachineRegisterInfo &MRI) { + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + switch (IntrinID) { + default: + break; + case Intrinsic::aarch64_crypto_sha1h: { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(2).getReg(); + + // FIXME: Should this be an assert? + if (MRI.getType(DstReg).getSizeInBits() != 32 || + MRI.getType(SrcReg).getSizeInBits() != 32) + return false; + + // The operation has to happen on FPRs. Set up some new FPR registers for + // the source and destination if they are on GPRs. 
+ if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)}); + + // Make sure the copy ends up getting constrained properly. + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + } + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) + DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + + // Actually insert the instruction. + auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); + constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); + + // Did we create a new register for the destination? + if (DstReg != I.getOperand(0).getReg()) { + // Yep. Copy the result of the instruction back into the original + // destination. + MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg}); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + } + + I.eraseFromParent(); + return true; + } + case Intrinsic::frameaddress: + case Intrinsic::returnaddress: { + MachineFunction &MF = *I.getParent()->getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Depth = I.getOperand(2).getImm(); + Register DstReg = I.getOperand(0).getReg(); + RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); + + if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { + if (MFReturnAddr) { + MIRBuilder.buildCopy({DstReg}, MFReturnAddr); + I.eraseFromParent(); + return true; + } + MFI.setReturnAddressIsTaken(true); + MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass); + // Insert the copy from LR/X30 into the entry block, before it can be + // clobbered by anything. + MachineBasicBlock &EntryBlock = *MF.begin(); + if (!EntryBlock.isLiveIn(AArch64::LR)) + EntryBlock.addLiveIn(AArch64::LR); + MachineIRBuilder EntryBuilder(MF); + EntryBuilder.setInstr(*EntryBlock.begin()); + EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + MFReturnAddr = DstReg; + I.eraseFromParent(); + return true; + } + + MFI.setFrameAddressIsTaken(true); + Register FrameAddr(AArch64::FP); + while (Depth--) { + Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + auto Ldr = + MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}) + .addImm(0); + constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); + FrameAddr = NextFrame; + } + + if (IntrinID == Intrinsic::frameaddress) + MIRBuilder.buildCopy({DstReg}, {FrameAddr}); + else { + MFI.setReturnAddressIsTaken(true); + MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1); + } + + I.eraseFromParent(); + return true; + } + } + return false; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = (32 - *MaybeImmed) & 0x1f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = 31 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if 
(MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = (64 - *MaybeImmed) & 0x3f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = 63 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +/// Helper to select an immediate value that can be represented as a 12-bit +/// value shifted left by either 0 or 12. If it is possible to do so, return +/// the immediate and shift value. If not, return None. +/// +/// Used by selectArithImmed and selectNegArithImmed. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::select12BitValueWithLeftShift( + uint64_t Immed) const { + unsigned ShiftAmt; + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return None; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + return select12BitValueWithLeftShift(*MaybeImmed); +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { + // We need a register here, because we need to know if we have a 64 or 32 + // bit immediate. + if (!Root.isReg()) + return None; + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + uint64_t Immed = *MaybeImmed; + + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) + return None; + + // Check if we're dealing with a 32-bit type on the root or a 64-bit type on + // the root. + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + if (MRI.getType(Root.getReg()).getSizeInBits() == 32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + + if (Immed & 0xFFFFFFFFFF000000ULL) + return None; + + Immed &= 0xFFFFFFULL; + return select12BitValueWithLeftShift(Immed); +} + +/// Return true if it is worth folding MI into an extended register. That is, +/// if it's safe to pull it into the addressing mode of a load or store as a +/// shift. +bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + // Always fold if there is one use, or if we're optimizing for size. 
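+  // With a single use, the original shift instruction becomes dead once it is
+  // folded, so folding cannot increase the amount of work done.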
+ Register DefReg = MI.getOperand(0).getReg(); + if (MRI.hasOneNonDBGUse(DefReg) || + MI.getParent()->getParent()->getFunction().hasMinSize()) + return true; + + // It's better to avoid folding and recomputing shifts when we don't have a + // fastpath. + if (!STI.hasLSLFast()) + return false; + + // We have a fastpath, so folding a shift in and potentially computing it + // many times may be beneficial. Check if this is only used in memory ops. + // If it is, then we should fold. + return all_of(MRI.use_nodbg_instructions(DefReg), + [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); +} + +static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { + switch (Type) { + case AArch64_AM::SXTB: + case AArch64_AM::SXTH: + case AArch64_AM::SXTW: + return true; + default: + return false; + } +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectExtendedSHL( + MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, + unsigned SizeInBytes, bool WantsExt) const { + assert(Base.isReg() && "Expected base to be a register operand"); + assert(Offset.isReg() && "Expected offset to be a register operand"); + + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); + if (!OffsetInst) + return None; + + unsigned OffsetOpc = OffsetInst->getOpcode(); + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Now, try to find the specific G_CONSTANT. Start by assuming that the + // register we will offset is the LHS, and the register containing the + // constant is the RHS. + Register OffsetReg = OffsetInst->getOperand(1).getReg(); + Register ConstantReg = OffsetInst->getOperand(2).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) { + // We didn't get a constant on the RHS. If the opcode is a shift, then + // we're done. + if (OffsetOpc == TargetOpcode::G_SHL) + return None; + + // If we have a G_MUL, we can use either register. Try looking at the RHS. + std::swap(OffsetReg, ConstantReg); + ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) + return None; + } + + // The value must fit into 3 bits, and must be positive. Make sure that is + // true. + int64_t ImmVal = ValAndVReg->Value; + + // Since we're going to pull this into a shift, the constant value must be + // a power of 2. If we got a multiply, then we need to check this. + if (OffsetOpc == TargetOpcode::G_MUL) { + if (!isPowerOf2_32(ImmVal)) + return None; + + // Got a power of 2. So, the amount we'll shift is the log base-2 of that. + ImmVal = Log2_32(ImmVal); + } + + if ((ImmVal & 0x7) != ImmVal) + return None; + + // We are only allowed to shift by LegalShiftVal. This shift value is built + // into the instruction, so we can't just use whatever we want. + if (ImmVal != LegalShiftVal) + return None; + + unsigned SignExtend = 0; + if (WantsExt) { + // Check if the offset is defined by an extend. + MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); + auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; + // We only support SXTW for signed extension here. 
+ if (SignExtend && Ext != AArch64_AM::SXTW) + return None; + + // Need a 32-bit wide register here. + MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); + OffsetReg = ExtInst->getOperand(1).getReg(); + OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); + } + + // We can use the LHS of the GEP as the base, and the LHS of the shift as an + // offset. Signify that we are shifting by setting the shift flag to 1. + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. + MIB.addImm(SignExtend); + MIB.addImm(1); + }}}; +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_PTR_ADD. + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. + MachineInstr *OffsetInst = + getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); + return selectExtendedSHL(Root, PtrAdd->getOperand(1), + OffsetInst->getOperand(0), SizeInBytes, + /*WantsExt=*/false); +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(2).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. 
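+             // The two immediates are the sign-extend flag and the shift
+             // ('do shift') flag; both are zero for a plain register offset.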
+ MIB.addImm(0); + MIB.addImm(0); + }}}; +} + +/// This is intended to be equivalent to selectAddrModeXRO in +/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // Try to fold shifts into the addressing mode. + auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); + if (AddrModeFns) + return AddrModeFns; + + // If that doesn't work, see if it's possible to fold in registers from + // a GEP. + return selectAddrModeRegisterOffset(Root); +} + +/// This is used for computing addresses like this: +/// +/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] +/// +/// Where we have a 64-bit base register, a 32-bit offset register, and an +/// extend (which may or may not be signed). +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + MachineOperand &LHS = PtrAdd->getOperand(1); + MachineOperand &RHS = PtrAdd->getOperand(2); + MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); + + // The first case is the same as selectAddrModeXRO, except we need an extend. + // In this case, we try to find a shift and extend, and fold them into the + // addressing mode. + // + // E.g. + // + // off_reg = G_Z/S/ANYEXT ext_reg + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // In this case we can get a load like this: + // + // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] + auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), + SizeInBytes, /*WantsExt=*/true); + if (ExtendedShl) + return ExtendedShl; + + // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. + // + // e.g. + // ldr something, [base_reg, ext_reg, sxtw] + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Check if this is an extend. We'll get an extend type if it is. + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + // Need a 32-bit wide register. + MachineIRBuilder MIB(*PtrAdd); + Register ExtReg = + narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB); + unsigned SignExtend = Ext == AArch64_AM::SXTW; + + // Base is LHS, offset is ExtReg. + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(SignExtend); + MIB.addImm(0); + }}}; +} + +/// Select a "register plus unscaled signed 9-bit immediate" address. This +/// should only match when there is an offset that is not valid for a scaled +/// immediate addressing mode. The "Size" argument is the size in bytes of the +/// memory reference, which is needed here to know what is valid for a scaled +/// immediate. 
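+/// For example (illustrative), a 4-byte load with an offset of 3 cannot use
+/// the scaled-immediate form, but does fit the unscaled 9-bit (LDUR-style)
+/// form matched here.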
+InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + if (!Root.isReg()) + return None; + + if (!isBaseWithConstantOffset(Root, MRI)) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + MachineOperand &OffImm = RootDef->getOperand(2); + if (!OffImm.isReg()) + return None; + MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); + if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + int64_t RHSC; + MachineOperand &RHSOp1 = RHS->getOperand(1); + if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) + return None; + RHSC = RHSOp1.getCImm()->getSExtValue(); + + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) + return None; + if (RHSC >= -256 && RHSC < 256) { + MachineOperand &Base = RootDef->getOperand(1); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, + }}; + } + return None; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, + unsigned Size, + MachineRegisterInfo &MRI) const { + if (RootDef.getOpcode() != AArch64::G_ADD_LOW) + return None; + MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); + if (Adrp.getOpcode() != AArch64::ADRP) + return None; + + // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. + // TODO: Need to check GV's offset % size if doing offset folding into globals. + assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global"); + auto GV = Adrp.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return None; + + auto &MF = *RootDef.getParent()->getParent(); + if (GV->getPointerAlignment(MF.getDataLayout()) < Size) + return None; + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); + MachineIRBuilder MIRBuilder(RootDef); + Register AdrpReg = Adrp.getOperand(0).getReg(); + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addGlobalAddress(GV, /* Offset */ 0, + OpFlags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + }}}; +} + +/// Select a "register plus scaled unsigned 12-bit immediate" address. The +/// "Size" argument is the size in bytes of the memory reference, which +/// determines the scale. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const { + MachineFunction &MF = *Root.getParent()->getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (!Root.isReg()) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; + } + + CodeModel::Model CM = MF.getTarget().getCodeModel(); + // Check if we can fold in the ADD of small code model ADRP + ADD address. 
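+  // i.e. fold the G_ADD_LOW into the memory operation's :lo12: page-offset
+  // immediate instead of materializing the address with a separate ADD.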
+ if (CM == CodeModel::Small) { + auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); + if (OpFns) + return OpFns; + } + + if (isBaseWithConstantOffset(Root, MRI)) { + MachineOperand &LHS = RootDef->getOperand(1); + MachineOperand &RHS = RootDef->getOperand(2); + MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); + MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (selectAddrModeUnscaled(Root, Size).hasValue()) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; +} + +/// Given a shift instruction, return the correct shift type for that +/// instruction. +static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { + // TODO: Handle AArch64_AM::ROR + switch (MI.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case TargetOpcode::G_SHL: + return AArch64_AM::LSL; + case TargetOpcode::G_LSHR: + return AArch64_AM::LSR; + case TargetOpcode::G_ASHR: + return AArch64_AM::ASR; + } +} + +/// Select a "shifted register" operand. If the value is not shifted, set the +/// shift operand to a default value of "lsl 0". +/// +/// TODO: Allow shifted register to be rotated in logical instructions. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + // Check if the operand is defined by an instruction which corresponds to + // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. + // + // TODO: Handle AArch64_AM::ROR for logical instructions. + MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); + if (!ShiftInst) + return None; + AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); + if (ShType == AArch64_AM::InvalidShiftExtend) + return None; + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + return None; + + // Need an immediate on the RHS. + MachineOperand &ShiftRHS = ShiftInst->getOperand(2); + auto Immed = getImmedFromMO(ShiftRHS); + if (!Immed) + return None; + + // We have something that we can fold. Fold in the shift's LHS and RHS into + // the instruction. 
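+  // e.g. (G_SHL x, 4) feeding an arithmetic or logical instruction becomes
+  // the shifted-register operand "x, lsl #4" (illustrative).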
+ MachineOperand &ShiftLHS = ShiftInst->getOperand(1); + Register ShiftReg = ShiftLHS.getReg(); + + unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); + unsigned Val = *Immed & (NumBits - 1); + unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; +} + +AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( + MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { + unsigned Opc = MI.getOpcode(); + + // Handle explicit extend instructions first. + if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { + unsigned Size; + if (Opc == TargetOpcode::G_SEXT) + Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + else + Size = MI.getOperand(2).getImm(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::SXTB; + case 16: + return AArch64_AM::SXTH; + case 32: + return AArch64_AM::SXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::UXTB; + case 16: + return AArch64_AM::UXTH; + case 32: + return AArch64_AM::UXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + // Don't have an explicit extend. Try to handle a G_AND with a constant mask + // on the RHS. + if (Opc != TargetOpcode::G_AND) + return AArch64_AM::InvalidShiftExtend; + + Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2)); + if (!MaybeAndMask) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = *MaybeAndMask; + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } +} + +Register AArch64InstructionSelector::narrowExtendRegIfNeeded( + Register ExtReg, MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + if (MRI.getType(ExtReg).getSizeInBits() == 32) + return ExtReg; + + // Insert a copy to move ExtReg to GPR32. + Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + + // Select the copy into a subregister copy. + selectCopy(*Copy, TII, MRI, TRI, RBI); + return Copy.getReg(0); +} + +Register AArch64InstructionSelector::widenGPRBankRegIfNeeded( + Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const { + assert(WideSize >= 8 && "WideSize is smaller than all possible registers?"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NarrowSize = MRI.getType(Reg).getSizeInBits(); + assert(WideSize >= NarrowSize && + "WideSize cannot be smaller than NarrowSize!"); + + // If the sizes match, just return the register. + // + // If NarrowSize is an s1, then we can select it to any size, so we'll treat + // it as a don't care. + if (NarrowSize == WideSize || NarrowSize == 1) + return Reg; + + // Now check the register classes. 
+ const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); + const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize); + const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize); + assert(OrigRC && "Could not determine narrow RC?"); + assert(WideRC && "Could not determine wide RC?"); + + // If the sizes differ, but the register classes are the same, there is no + // need to insert a SUBREG_TO_REG. + // + // For example, an s8 that's supposed to be a GPR will be selected to either + // a GPR32 or a GPR64 register. Note that this assumes that the s8 will + // always end up on a GPR32. + if (OrigRC == WideRC) + return Reg; + + // We have two different register classes. Insert a SUBREG_TO_REG. + unsigned SubReg = 0; + getSubRegForClass(OrigRC, TRI, SubReg); + assert(SubReg && "Couldn't determine subregister?"); + + // Build the SUBREG_TO_REG and return the new, widened register. + auto SubRegToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {}) + .addImm(0) + .addUse(Reg) + .addImm(SubReg); + constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI); + return SubRegToReg.getReg(0); +} + +/// Select an "extended register" operand. This operand folds in an extend +/// followed by an optional left shift. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithExtendedRegister( + MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + uint64_t ShiftVal = 0; + Register ExtReg; + AArch64_AM::ShiftExtendType Ext; + MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); + if (!RootDef) + return None; + + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + return None; + + // Check if we can fold a shift and an extend. + if (RootDef->getOpcode() == TargetOpcode::G_SHL) { + // Look for a constant on the RHS of the shift. + MachineOperand &RHS = RootDef->getOperand(2); + Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS); + if (!MaybeShiftVal) + return None; + ShiftVal = *MaybeShiftVal; + if (ShiftVal > 4) + return None; + // Look for a valid extend instruction on the LHS of the shift. + MachineOperand &LHS = RootDef->getOperand(1); + MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); + if (!ExtDef) + return None; + Ext = getExtendTypeForInst(*ExtDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = ExtDef->getOperand(1).getReg(); + } else { + // Didn't get a shift. Try just folding an extend. + Ext = getExtendTypeForInst(*RootDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = RootDef->getOperand(1).getReg(); + + // If we have a 32 bit instruction which zeroes out the high half of a + // register, we get an implicit zero extend for free. Check if we have one. + // FIXME: We actually emit the extend right now even though we don't have + // to. + if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { + MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); + if (ExtInst && isDef32(*ExtInst)) + return None; + } + } + + // We require a GPR32 here. Narrow the ExtReg if needed using a subregister + // copy. 
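+  // (The extended-register forms of the arithmetic instructions take a W
+  // register for the operand being extended, e.g. add x0, x1, w2, sxth.)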
+ MachineIRBuilder MIB(*RootDef); + ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(getArithExtendImm(Ext, ShiftVal)); + }}}; +} + +void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); + assert(CstVal && "Expected constant value"); + MIB.addImm(CstVal.getValue()); +} + +void AArch64InstructionSelector::renderLogicalImm32( + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); + MIB.addImm(Enc); +} + +void AArch64InstructionSelector::renderLogicalImm64( + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); + MIB.addImm(Enc); +} + +bool AArch64InstructionSelector::isLoadStoreOfNumBytes( + const MachineInstr &MI, unsigned NumBytes) const { + if (!MI.mayLoadOrStore()) + return false; + assert(MI.hasOneMemOperand() && + "Expected load/store to have only one mem op!"); + return (*MI.memoperands_begin())->getSize() == NumBytes; +} + +bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) + return false; + + // Only return true if we know the operation will zero-out the high half of + // the 64-bit register. Truncates can be subregister copies, which don't + // zero out the high bits. Copies and other copy-like instructions can be + // fed by truncates, or could be lowered as subregister copies. + switch (MI.getOpcode()) { + default: + return true; + case TargetOpcode::COPY: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_PHI: + return false; + } +} + + +// Perform fixups on the given PHI instruction's operands to force them all +// to be the same as the destination regbank. +static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, + const AArch64RegisterBankInfo &RBI) { + assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI"); + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg); + assert(DstRB && "Expected PHI dst to have regbank assigned"); + MachineIRBuilder MIB(MI); + + // Go through each operand and ensure it has the same regbank. + for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) + continue; + Register OpReg = MO.getReg(); + const RegisterBank *RB = MRI.getRegBankOrNull(OpReg); + if (RB != DstRB) { + // Insert a cross-bank copy. 
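+      // The copy goes immediately after the operand's defining instruction,
+      // and the PHI operand is rewritten to use the copied vreg below.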
+ auto *OpDef = MRI.getVRegDef(OpReg); + const LLT &Ty = MRI.getType(OpReg); + MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator())); + auto Copy = MIB.buildCopy(Ty, OpReg); + MRI.setRegBank(Copy.getReg(0), *DstRB); + MO.setReg(Copy.getReg(0)); + } + } +} + +void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { + // We're looking for PHIs, build a list so we don't invalidate iterators. + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<MachineInstr *, 32> Phis; + for (auto &BB : MF) { + for (auto &MI : BB) { + if (MI.getOpcode() == TargetOpcode::G_PHI) + Phis.emplace_back(&MI); + } + } + + for (auto *MI : Phis) { + // We need to do some work here if the operand types are < 16 bit and they + // are split across fpr/gpr banks. Since all types <32b on gpr + // end up being assigned gpr32 regclasses, we can end up with PHIs here + // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't + // be selecting heterogenous regbanks for operands if possible, but we + // still need to be able to deal with it here. + // + // To fix this, if we have a gpr-bank operand < 32b in size and at least + // one other operand is on the fpr bank, then we add cross-bank copies + // to homogenize the operand banks. For simplicity the bank that we choose + // to settle on is whatever bank the def operand has. For example: + // + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2 + // => + // %bb2: + // ... + // %in2_copy:gpr(s16) = COPY %in2:fpr(s16) + // ... + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 + bool HasGPROp = false, HasFPROp = false; + for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) { + const auto &MO = MI->getOperand(OpIdx); + if (!MO.isReg()) + continue; + const LLT &Ty = MRI.getType(MO.getReg()); + if (!Ty.isValid() || !Ty.isScalar()) + break; + if (Ty.getSizeInBits() >= 32) + break; + const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); + // If for some reason we don't have a regbank yet. Don't try anything. + if (!RB) + break; + + if (RB->getID() == AArch64::GPRRegBankID) + HasGPROp = true; + else + HasFPROp = true; + } + // We have heterogenous regbanks, need to fixup. + if (HasGPROp && HasFPROp) + fixupPHIOpBanks(*MI, MRI, RBI); + } +} + +namespace llvm { +InstructionSelector * +createAArch64InstructionSelector(const AArch64TargetMachine &TM, + AArch64Subtarget &Subtarget, + AArch64RegisterBankInfo &RBI) { + return new AArch64InstructionSelector(TM, Subtarget, RBI); +} +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp new file mode 100644 index 000000000000..2eaec0b970fa --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -0,0 +1,809 @@ +//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for +/// AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AArch64LegalizerInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" + +#define DEBUG_TYPE "aarch64-legalinfo" + +using namespace llvm; +using namespace LegalizeActions; +using namespace LegalizeMutations; +using namespace LegalityPredicates; + +AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) + : ST(&ST) { + using namespace TargetOpcode; + const LLT p0 = LLT::pointer(0, 64); + const LLT s1 = LLT::scalar(1); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); + const LLT s256 = LLT::scalar(256); + const LLT s512 = LLT::scalar(512); + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s8 = LLT::vector(8, 8); + const LLT v4s8 = LLT::vector(4, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s16 = LLT::vector(4, 16); + const LLT v2s16 = LLT::vector(2, 16); + const LLT v2s32 = LLT::vector(2, 32); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + const LLT v2p0 = LLT::vector(2, p0); + + const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); + + // FIXME: support subtargets which have neon/fp-armv8 disabled. + if (!ST.hasNEON() || !ST.hasFPARMv8()) { + computeTables(); + return; + } + + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + .clampScalar(0, s1, s64) + .widenScalarToNextPow2(0, 8) + .fewerElementsIf( + [=](const LegalityQuery &Query) { + return Query.Types[0].isVector() && + (Query.Types[0].getElementType() != s64 || + Query.Types[0].getNumElements() != 2); + }, + [=](const LegalityQuery &Query) { + LLT EltTy = Query.Types[0].getElementType(); + if (EltTy == s64) + return std::make_pair(0, LLT::vector(2, 64)); + return std::make_pair(0, EltTy); + }); + + getActionDefinitionsBuilder(G_PHI) + .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) + .clampScalar(0, s16, s64) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({s32, s64, v4s32, v2s32, v2s64}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0); + + getActionDefinitionsBuilder(G_SHL) + .legalFor({{s32, s32}, {s64, s64}, + {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder(G_PTR_ADD) + .legalFor({{p0, s64}, {v2p0, v2s64}}) + .clampScalar(1, s64, s64); + + getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); + + getActionDefinitionsBuilder({G_SDIV, G_UDIV}) + .legalFor({s32, s64}) + .libcallFor({s128}) + .clampScalar(0, s32, s64) + 
.widenScalarToNextPow2(0) + .scalarize(0); + + getActionDefinitionsBuilder({G_LSHR, G_ASHR}) + .customIf([=](const LegalityQuery &Query) { + const auto &SrcTy = Query.Types[0]; + const auto &AmtTy = Query.Types[1]; + return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + AmtTy.getSizeInBits() == 32; + }) + .legalFor({{s32, s32}, + {s32, s64}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder({G_SREM, G_UREM}) + .lowerFor({s1, s8, s16, s32, s64}); + + getActionDefinitionsBuilder({G_SMULO, G_UMULO}) + .lowerFor({{s64, s1}}); + + getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); + + getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO}) + .legalFor({{s32, s1}, {s64, s1}}) + .minScalar(0, s32); + + getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) + .legalFor({s32, s64, v2s64, v4s32, v2s32}); + + getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); + + getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, + G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, + G_FNEARBYINT}) + // If we don't have full FP16 support, then scalarize the elements of + // vectors containing fp16 types. + .fewerElementsIf( + [=, &ST](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + return Ty.isVector() && Ty.getElementType() == s16 && + !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) + // If we don't have full FP16 support, then widen s16 to s32 if we + // encounter it. + .widenScalarIf( + [=, &ST](const LegalityQuery &Query) { + return Query.Types[0] == s16 && !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) + .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); + + getActionDefinitionsBuilder( + {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW}) + // We need a call for these, so we always need to scalarize. + .scalarize(0) + // Regardless of FP16 support, widen 16-bit elements to 32-bits. 
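+      // (the math library only provides float/double entry points for these
+      // routines, so s16 operands must be promoted before the libcall)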
+ .minScalar(0, s32) + .libcallFor({s32, s64, v2s32, v4s32, v2s64}); + + getActionDefinitionsBuilder(G_INSERT) + .unsupportedIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits(); + }) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0) + return false; + return isPowerOf2_32(Ty1.getSizeInBits()) && + (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8); + }) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .maxScalarIf(typeInSet(0, {s32}), 1, s16) + .maxScalarIf(typeInSet(0, {s64}), 1, s32) + .widenScalarToNextPow2(1); + + getActionDefinitionsBuilder(G_EXTRACT) + .unsupportedIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits(); + }) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) + return false; + if (Ty1 == p0) + return true; + return isPowerOf2_32(Ty0.getSizeInBits()) && + (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); + }) + .clampScalar(1, s32, s128) + .widenScalarToNextPow2(1) + .maxScalarIf(typeInSet(1, {s32}), 0, s16) + .maxScalarIf(typeInSet(1, {s64}), 0, s32) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 8, 2}, + {s64, p0, 16, 2}, + {s64, p0, 32, 4}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {v2s32, p0, 64, 8}}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + // TODO: We could support sum-of-pow2's but the lowering code doesn't know + // how to do that yet. 
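+      // e.g. a 48-bit extending load would need to be broken into a 32-bit
+      // and a 16-bit load, which the lowering below cannot produce yet.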
+ .unsupportedIfMemSizeNotPow2() + // Lower anything left over into G_*EXT and G_LOAD + .lower(); + + auto IsPtrVecPred = [=](const LegalityQuery &Query) { + const LLT &ValTy = Query.Types[0]; + if (!ValTy.isVector()) + return false; + const LLT EltTy = ValTy.getElementType(); + return EltTy.isPointer() && EltTy.getAddressSpace() == 0; + }; + + getActionDefinitionsBuilder(G_LOAD) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {s128, p0, 128, 8}, + {v8s8, p0, 64, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) + // These extends are also legal + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) + .clampScalar(0, s8, s64) + .lowerIfMemSizeNotPow2() + // Lower any any-extending loads left into G_ANYEXT and G_LOAD + .lowerIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; + }) + .widenScalarToNextPow2(0) + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); + + getActionDefinitionsBuilder(G_STORE) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {s128, p0, 128, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) + .clampScalar(0, s8, s64) + .lowerIfMemSizeNotPow2() + .lowerIf([=](const LegalityQuery &Query) { + return Query.Types[0].isScalar() && + Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; + }) + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); + + // Constants + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({p0, s8, s16, s32, s64}) + .clampScalar(0, s8, s64) + .widenScalarToNextPow2(0); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({s32, s64}) + .clampScalar(0, s32, s64); + + getActionDefinitionsBuilder(G_ICMP) + .legalFor({{s32, s32}, + {s32, s64}, + {s32, p0}, + {v4s32, v4s32}, + {v2s32, v2s32}, + {v2s64, v2s64}, + {v2s64, v2p0}, + {v4s16, v4s16}, + {v8s16, v8s16}, + {v8s8, v8s8}, + {v16s8, v16s8}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s32) + .minScalarEltSameAsIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + return Ty.isVector() && !SrcTy.getElementType().isPointer() && + Ty.getElementType() != SrcTy.getElementType(); + }, + 0, 1) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, + 1, s32) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, + s64) + .widenScalarOrEltToNextPow2(1); + + getActionDefinitionsBuilder(G_FCMP) + .legalFor({{s32, s32}, {s32, s64}}) + .clampScalar(0, s32, s32) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1); + + // Extensions + auto ExtLegalFunc = [=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); + + if (DstSize == 128 && !Query.Types[0].isVector()) + return false; // Extending to a scalar s128 needs narrowing. + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; + + const LLT &SrcTy = Query.Types[1]; + + // Special case for s1. 
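+    // An s1 source is always fine to extend, even though it would fail the
+    // minimum-width / power-of-two checks applied to other source types below.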
+ if (SrcTy == s1) + return true; + + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; + + return true; + }; + getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) + .legalIf(ExtLegalFunc) + .clampScalar(0, s64, s64); // Just for s128, others are handled above. + + getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); + + getActionDefinitionsBuilder(G_SEXT_INREG) + .legalFor({s32, s64}) + .lower(); + + // FP conversions + getActionDefinitionsBuilder(G_FPTRUNC).legalFor( + {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); + getActionDefinitionsBuilder(G_FPEXT).legalFor( + {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}); + + // Conversions + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1); + + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0); + + // Control-flow + getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32}); + getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); + + // Select + // FIXME: We can probably do a bit better than just scalarizing vector + // selects. + getActionDefinitionsBuilder(G_SELECT) + .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .scalarize(0); + + // Pointer-handling + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + + if (TM.getCodeModel() == CodeModel::Small) + getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); + else + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + + getActionDefinitionsBuilder(G_PTRTOINT) + .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) + .maxScalar(0, s64) + .widenScalarToNextPow2(0, /*Min*/ 8); + + getActionDefinitionsBuilder(G_INTTOPTR) + .unsupportedIf([&](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); + }) + .legalFor({{p0, s64}}); + + // Casts for 32 and 64-bit width type are just copies. + // Same for 128-bit width type, except they are on the FPR bank. + getActionDefinitionsBuilder(G_BITCAST) + // FIXME: This is wrong since G_BITCAST is not allowed to change the + // number of bits but it's what the previous code described and fixing + // it breaks tests. + .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, + v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, + v2p0}); + + getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); + + // va_list must be a pointer, but most sized types are pretty easy to handle + // as the destination. 
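+  // The custom expansion for G_VAARG is implemented in legalizeVaArg() below.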
+ getActionDefinitionsBuilder(G_VAARG) + .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) + .clampScalar(0, s8, s64) + .widenScalarToNextPow2(0, /*Min*/ 8); + + if (ST.hasLSE()) { + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lowerIf(all( + typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0), + atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); + + getActionDefinitionsBuilder( + {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, + G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, + G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG}) + .legalIf(all( + typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), + atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); + } + + getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); + + // Merge/Unmerge + for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { + unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; + unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; + + auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) { + const LLT &Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const LLT &EltTy = Ty.getElementType(); + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + return true; + if (!isPowerOf2_32(EltTy.getSizeInBits())) + return true; + } + return false; + }; + + // FIXME: This rule is horrible, but specifies the same as what we had + // before with the particularly strange definitions removed (e.g. + // s8 = G_MERGE_VALUES s32, s32). + // Part of the complexity comes from these ops being extremely flexible. For + // example, you can build/decompose vectors with it, concatenate vectors, + // etc. and in addition to this you can also bitcast with it at the same + // time. We've been considering breaking it up into multiple ops to make it + // more manageable throughout the backend. + getActionDefinitionsBuilder(Op) + // Break up vectors with weird elements into scalars + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + scalarize(0)) + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + scalarize(1)) + // Clamp the big scalar to s8-s512 and make it either a power of 2, 192, + // or 384. + .clampScalar(BigTyIdx, s8, s512) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 64 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. + const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 + << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, s8, s256) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8) + // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384, + // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>. + // At this point it's simple enough to accept the legal types. 
+ .legalIf([=](const LegalityQuery &Query) { + const LLT &BigTy = Query.Types[BigTyIdx]; + const LLT &LitTy = Query.Types[LitTyIdx]; + if (BigTy.isVector() && BigTy.getSizeInBits() < 32) + return false; + if (LitTy.isVector() && LitTy.getSizeInBits() < 32) + return false; + return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; + }) + // Any vectors left are the wrong size. Scalarize them. + .scalarize(0) + .scalarize(1); + } + + getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) + .unsupportedIf([=](const LegalityQuery &Query) { + const LLT &EltTy = Query.Types[1].getElementType(); + return Query.Types[0] != EltTy; + }) + .minScalar(2, s64) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[1]; + return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || + VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32; + }); + + getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[0]; + // TODO: Support s8 and s16 + return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64; + }); + + getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalFor({{v4s16, s16}, + {v8s16, s16}, + {v2s32, s32}, + {v4s32, s32}, + {v2p0, p0}, + {v2s64, s64}}) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + + // Deal with larger scalar types, which will be implicitly truncated. + .legalIf([=](const LegalityQuery &Query) { + return Query.Types[0].getScalarSizeInBits() < + Query.Types[1].getSizeInBits(); + }) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct( + {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .scalarize(1); + + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .legalIf([=](const LegalityQuery &Query) { + const LLT &DstTy = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + // For now just support the TBL2 variant which needs the source vectors + // to be the same size as the dest. + if (DstTy != SrcTy) + return false; + for (auto &Ty : {v2s32, v4s32, v2s64}) { + if (DstTy == Ty) + return true; + } + return false; + }) + // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we + // just want those lowered into G_BUILD_VECTOR + .lowerIf([=](const LegalityQuery &Query) { + return !Query.Types[1].isVector(); + }) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64); + + getActionDefinitionsBuilder(G_CONCAT_VECTORS) + .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); + + getActionDefinitionsBuilder(G_JUMP_TABLE) + .legalFor({{p0}, {s64}}); + + getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { + return Query.Types[0] == p0 && Query.Types[1] == s64; + }); + + getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + + computeTables(); + verify(*ST.getInstrInfo()); +} + +bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + switch (MI.getOpcode()) { + default: + // No idea what to do. 
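+    // Returning false here is treated as UnableToLegalize by the caller.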
+ return false; + case TargetOpcode::G_VAARG: + return legalizeVaArg(MI, MRI, MIRBuilder); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); + } + + llvm_unreachable("expected switch to return"); +} + +bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); + // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + + // G_ADD_LOW instructions. + // By splitting this here, we can optimize accesses in the small code model by + // folding in the G_ADD_LOW into the load/store offset. + auto GV = MI.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return true; // Don't want to modify TLS vars. + + auto &TM = ST->getTargetLowering()->getTargetMachine(); + unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM); + + if (OpFlags & AArch64II::MO_GOT) + return true; + + Register DstReg = MI.getOperand(0).getReg(); + auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) + .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); + // Set the regclass on the dest reg too. + MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + + MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) + .addGlobalAddress(GV, 0, + OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + MI.eraseFromParent(); + return true; +} + +bool AArch64LegalizerInfo::legalizeIntrinsic( + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + +bool AArch64LegalizerInfo::legalizeShlAshrLshr( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_ASHR || + MI.getOpcode() == TargetOpcode::G_LSHR || + MI.getOpcode() == TargetOpcode::G_SHL); + // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the + // imported patterns can select it later. Either way, it will be legal. + Register AmtReg = MI.getOperand(2).getReg(); + auto *CstMI = MRI.getVRegDef(AmtReg); + assert(CstMI && "expected to find a vreg def"); + if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT) + return true; + // Check the shift amount is in range for an immediate form. + unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue(); + if (Amount > 31) + return true; // This will have to remain a register variant. 
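+  // Otherwise zero-extend the 32-bit constant amount to s64 so the imported
+  // immediate-shift patterns can match it.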
+ assert(MRI.getType(AmtReg).getSizeInBits() == 32); + auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); + MI.getOperand(2).setReg(ExtCst.getReg(0)); + return true; +} + +bool AArch64LegalizerInfo::legalizeLoadStore( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_STORE || + MI.getOpcode() == TargetOpcode::G_LOAD); + // Here we just try to handle vector loads/stores where our value type might + // have pointer elements, which the SelectionDAG importer can't handle. To + // allow the existing patterns for s64 to fire for p0, we just try to bitcast + // the value to use s64 types. + + // Custom legalization requires the instruction, if not deleted, must be fully + // legalized. In order to allow further legalization of the inst, we create + // a new instruction and erase the existing one. + + Register ValReg = MI.getOperand(0).getReg(); + const LLT ValTy = MRI.getType(ValReg); + + if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || + ValTy.getElementType().getAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); + return false; + } + + unsigned PtrSize = ValTy.getElementType().getSizeInBits(); + const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize); + auto &MMO = **MI.memoperands_begin(); + if (MI.getOpcode() == TargetOpcode::G_STORE) { + auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg); + MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO); + } else { + auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO); + MIRBuilder.buildBitcast(ValReg, NewLoad); + } + MI.eraseFromParent(); + return true; +} + +bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + Align Alignment(MI.getOperand(2).getImm()); + Register Dst = MI.getOperand(0).getReg(); + Register ListPtr = MI.getOperand(1).getReg(); + + LLT PtrTy = MRI.getType(ListPtr); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + const unsigned PtrSize = PtrTy.getSizeInBits() / 8; + const Align PtrAlign = Align(PtrSize); + auto List = MIRBuilder.buildLoad( + PtrTy, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + PtrSize, PtrAlign)); + + MachineInstrBuilder DstPtr; + if (Alignment > PtrAlign) { + // Realign the list to the actual required alignment. 
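+    // DstPtr = (List + Alignment - 1) with the low Log2(Alignment) bits
+    // cleared, i.e. List rounded up to the required alignment.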
+ auto AlignMinus1 = + MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1); + auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); + DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment)); + } else + DstPtr = List; + + uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8; + MIRBuilder.buildLoad( + Dst, DstPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + ValSize, std::max(Alignment, PtrAlign))); + + auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign)); + + auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); + + MIRBuilder.buildStore(NewList, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, + PtrSize, PtrAlign)); + + MI.eraseFromParent(); + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h new file mode 100644 index 000000000000..1cb24559c1ab --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -0,0 +1,51 @@ +//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for +/// AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class LLVMContext; +class AArch64Subtarget; + +/// This class provides the information for the target register banks. +class AArch64LegalizerInfo : public LegalizerInfo { +public: + AArch64LegalizerInfo(const AArch64Subtarget &ST); + + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + + bool legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const override; + +private: + bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + + bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + const AArch64Subtarget *ST; +}; +} // End llvm namespace. +#endif diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp new file mode 100644 index 000000000000..baa8515baf3e --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -0,0 +1,507 @@ + //=== lib/CodeGen/GlobalISel/AArch64PostLegalizerCombiner.cpp -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This performs post-legalization combines on generic MachineInstrs. +// +// Any combine that this pass performs must preserve instruction legality. +// Combines unconcerned with legality should be handled by the +// PreLegalizerCombiner instead. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR. +/// +/// Used for matching target-supported shuffles before codegen. +struct ShuffleVectorPseudo { + unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1) + Register Dst; ///< Destination register. + SmallVector<SrcOp, 2> SrcOps; ///< Source registers. + ShuffleVectorPseudo(unsigned Opc, Register Dst, + std::initializer_list<SrcOp> SrcOps) + : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; + ShuffleVectorPseudo() {} +}; + +/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. +/// If \p MI is not a splat, returns None. +static Optional<int> getSplatIndex(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + "Only G_SHUFFLE_VECTOR can have a splat index!"); + ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); + auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; }); + + // If all elements are undefined, this shuffle can be considered a splat. + // Return 0 for better potential for callers to simplify. + if (FirstDefinedIdx == Mask.end()) + return 0; + + // Make sure all remaining elements are either undef or the same + // as the first non-undef value. + int SplatValue = *FirstDefinedIdx; + if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()), + [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; })) + return None; + + return SplatValue; +} + +/// Check if a vector shuffle corresponds to a REV instruction with the +/// specified blocksize. +static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, + unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + assert(EltSize != 64 && "EltSize cannot be 64 for REV mask."); + + unsigned BlockElts = M[0] + 1; + + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSize; + + if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + // Ignore undef indices. + if (M[i] < 0) + continue; + if (static_cast<unsigned>(M[i]) != + (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts. +/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult. 
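+/// For example, with NumElts == 4 the mask <0, 4, 2, 6> selects G_TRN1 and
+/// <1, 5, 3, 7> selects G_TRN2; undef (-1) entries are ignored.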
+static bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResult) {
+  if (NumElts % 2 != 0)
+    return false;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
+        (M[i + 1] >= 0 &&
+         static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
+/// sources of the shuffle are different.
+static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
+                                                      unsigned NumElts) {
+  // Look for the first non-undef element.
+  auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+  if (FirstRealElt == M.end())
+    return None;
+
+  // Use APInt to handle overflow when calculating expected element.
+  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+
+  // The following shuffle indices must be the successive elements after the
+  // first real element.
+  if (any_of(
+          make_range(std::next(FirstRealElt), M.end()),
+          [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
+    return None;
+
+  // The index of an EXT is the first element if it is not UNDEF.
+  // Watch out for the beginning UNDEFs. The EXT index should be the expected
+  // value of the first element. E.g.
+  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+  // ExpectedElt is the last mask index plus 1.
+  uint64_t Imm = ExpectedElt.getZExtValue();
+  bool ReverseExt = false;
+
+  // There are two different cases that require reversing the input vectors.
+  // For example, for vector <4 x i32> we have the following cases,
+  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+  // reversing the two input vectors.
+  if (Imm < NumElts)
+    ReverseExt = true;
+  else
+    Imm -= NumElts;
+  return std::make_pair(ReverseExt, Imm);
+}
+
+/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
+/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
+static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResult) {
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    // Skip undef indices.
+    if (M[i] < 0)
+      continue;
+    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+      return false;
+  }
+  return true;
+}
+
+/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
+/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
+static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResult) {
+  if (NumElts % 2 != 0)
+    return false;
+
+  // 0 means use ZIP1, 1 means use ZIP2.
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
+        (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+  return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
+/// G_REV instruction. Stores the appropriate G_REV opcode in \p MatchInfo.
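+/// For example, a <4 x s16> shuffle with mask <3, 2, 1, 0> is a REV64: each
+/// 64-bit block has its 16-bit elements reversed.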
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned EltSize = Ty.getScalarSizeInBits(); + + // Element size for a rev cannot be 64. + if (EltSize == 64) + return false; + + unsigned NumElts = Ty.getNumElements(); + + // Try to produce G_REV64 + if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) { + MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src}); + return true; + } + + // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support. + // This should be identical to above, but with a constant 32 and constant + // 16. + return false; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_TRN1 or G_TRN2 instruction. +static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_UZP1 or G_UZP2 instruction. +/// +/// \param [in] MI - The shuffle vector instruction. +/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success. +static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isUZPMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isZipMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + if (Lane != 0) + return false; + + // Try to match a vector splat operation into a dup instruction. 
+ // We're looking for this pattern: + // + // %scalar:gpr(s64) = COPY $x0 + // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF + // %cst0:gpr(s32) = G_CONSTANT i32 0 + // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) + // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) + // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>) + // + // ...into: + // %splat = G_DUP %scalar + + // Begin matching the insert. + auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT, + MI.getOperand(1).getReg(), MRI); + if (!InsMI) + return false; + // Match the undef vector operand. + if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), + MRI)) + return false; + + // Match the index constant 0. + int64_t Index = 0; + if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) + return false; + + MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), + {InsMI->getOperand(2).getReg()}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromBuildVector(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(Lane >= 0 && "Expected positive lane?"); + // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the + // lane's definition directly. + auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, + MI.getOperand(1).getReg(), MRI); + if (!BuildVecMI) + return false; + Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); + MatchInfo = + ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg}); + return true; +} + +static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + auto MaybeLane = getSplatIndex(MI); + if (!MaybeLane) + return false; + int Lane = *MaybeLane; + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane < 0) + Lane = 0; + if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo)) + return true; + if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo)) + return true; + return false; +} + +static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + Register Dst = MI.getOperand(0).getReg(); + auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(), + MRI.getType(Dst).getNumElements()); + if (!ExtInfo) + return false; + bool ReverseExt; + uint64_t Imm; + std::tie(ReverseExt, Imm) = *ExtInfo; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + if (ReverseExt) + std::swap(V1, V2); + uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8; + Imm *= ExtFactor; + MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm}); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo. +/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR. +static bool applyShuffleVectorPseudo(MachineInstr &MI, + ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps); + MI.eraseFromParent(); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT. +/// Special-cased because the constant operand must be emitted as a G_CONSTANT +/// for the imported tablegen patterns to work. 
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + // Tablegen patterns expect an i32 G_CONSTANT as the final op. + auto Cst = + MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm()); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, + {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst}); + MI.eraseFromParent(); + return true; +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AArch64PostLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AArch64GenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + AArch64PostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, + MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + const auto *LI = + MI.getParent()->getParent()->getSubtarget().getLegalizerInfo(); + CombinerHelper Helper(Observer, B, KB, MDT, LI); + AArch64GenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg); + return Generated.tryCombineAll(Observer, MI, B, Helper); +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +class AArch64PostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AArch64PostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AArch64PostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + assert(MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Legalized) && + "Expected a legalized function?"); + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); 
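+  // Optimization-dependent combines are disabled at -O0 and for functions
+  // that skipFunction() rejects (e.g. ones marked optnone).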
+ bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) { + return new AArch64PostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp new file mode 100644 index 000000000000..9a1f200d5222 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -0,0 +1,203 @@ +//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// before the legalizer. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-prelegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +static bool matchFConstantToConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. 
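+/// The floating-point payload is reused bit-for-bit as the integer immediate.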
+static void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + +class AArch64PreLegalizerCombinerHelperState { +protected: + CombinerHelper &Helper; + +public: + AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper) + : Helper(Helper) {} +}; + +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AArch64PreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + +public: + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, MDT); + AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper); + + switch (MI.getOpcode()) { + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return (!EnableMinSize) ? 
Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; + } + default: + break; + } + } + + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); + } + + return false; +} + +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AArch64PreLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AArch64PreLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PreLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 machine instrs before legalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 machine instrs before legalization", false, + false) + + +namespace llvm { +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { + return new AArch64PreLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp new file mode 100644 index 000000000000..7e3ff1948dad --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -0,0 +1,868 @@ +//===- AArch64RegisterBankInfo.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AArch64GenRegisterBank.inc"
+
+// This file will be TableGen'ed at some point.
+#include "AArch64GenRegisterBankInfo.def"
+
+using namespace llvm;
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+    : AArch64GenRegisterBankInfo() {
+  static llvm::once_flag InitializeRegisterBankFlag;
+
+  static auto InitializeRegisterBankOnce = [&]() {
+    // We have only one set of register banks, whatever the subtarget
+    // is. Therefore, the initialization of the RegBanks table should be
+    // done only once. Indeed the table of all register banks
+    // (AArch64::RegBanks) is unique in the compiler. At some point, it
+    // will get tablegen'ed and the whole constructor becomes empty.
+
+    const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+    (void)RBGPR;
+    assert(&AArch64::GPRRegBank == &RBGPR &&
+           "The order in RegBanks is messed up");
+
+    const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+    (void)RBFPR;
+    assert(&AArch64::FPRRegBank == &RBFPR &&
+           "The order in RegBanks is messed up");
+
+    const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID);
+    (void)RBCCR;
+    assert(&AArch64::CCRegBank == &RBCCR &&
+           "The order in RegBanks is messed up");
+
+    // The GPR register bank is fully defined by all the registers in
+    // GPR64all + its subclasses.
+    assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+           "Subclass not added?");
+    assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+    // The FPR register bank is fully defined by all the registers in
+    // QQQQ + its subclasses.
+    assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+           "Subclass not added?");
+    assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+           "Subclass not added?");
+    assert(RBFPR.getSize() == 512 &&
+           "FPRs should hold up to 512-bit via QQQQ sequence");
+
+    assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+           "Class not added?");
+    assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+    // Check that the TableGen'ed-like file is in sync with our expectations.
+    // First, the Idx.
+    assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+                                  {PMI_GPR32, PMI_GPR64}) &&
+           "PartialMappingIdx's are incorrectly ordered");
+    assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
+                                  {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
+                                   PMI_FPR256, PMI_FPR512}) &&
+           "PartialMappingIdx's are incorrectly ordered");
+// Now, the content.
+// Check partial mapping.
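+// Each CHECK_* macro below expands to an assert, so these consistency checks
+// are only active in builds with assertions enabled.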
+#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ + do { \ + assert( \ + checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \ + #Idx " is incorrectly initialized"); \ + } while (false) + + CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); + CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); + CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); + CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); + CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); + CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR); + CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR); + +// Check value mapping. +#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ + do { \ + assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \ + PartialMappingIdx::PMI_First##RBName, Size, \ + Offset) && \ + #RBName #Size " " #Offset " is incorrectly initialized"); \ + } while (false) + +#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) + + CHECK_VALUEMAP(GPR, 32); + CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); + CHECK_VALUEMAP(FPR, 32); + CHECK_VALUEMAP(FPR, 64); + CHECK_VALUEMAP(FPR, 128); + CHECK_VALUEMAP(FPR, 256); + CHECK_VALUEMAP(FPR, 512); + +// Check the value mapping for 3-operands instructions where all the operands +// map to the same value mapping. +#define CHECK_VALUEMAP_3OPS(RBName, Size) \ + do { \ + CHECK_VALUEMAP_IMPL(RBName, Size, 0); \ + CHECK_VALUEMAP_IMPL(RBName, Size, 1); \ + CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ + } while (false) + + CHECK_VALUEMAP_3OPS(GPR, 32); + CHECK_VALUEMAP_3OPS(GPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 32); + CHECK_VALUEMAP_3OPS(FPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 128); + CHECK_VALUEMAP_3OPS(FPR, 256); + CHECK_VALUEMAP_3OPS(FPR, 512); + +#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ + do { \ + unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getCopyMapping( \ + AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ + " Src is incorrectly initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); + +#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ + do { \ + unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ + " Src is incorrectly 
initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); + + assert(verify(TRI) && "Invalid register bank information"); + }; + + llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); +} + +unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + // What do we do with different size? + // copy are same size. + // Will introduce other hooks for different size: + // * extract cost. + // * build_sequence cost. + + // Copy from (resp. to) GPR to (resp. from) FPR involves FMOV. + // FIXME: This should be deduced from the scheduling model. + if (&A == &AArch64::GPRRegBank && &B == &AArch64::FPRRegBank) + // FMOVXDr or FMOVWSr. + return 5; + if (&A == &AArch64::FPRRegBank && &B == &AArch64::GPRRegBank) + // FMOVDXr or FMOVSWr. + return 4; + + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank & +AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const { + switch (RC.getID()) { + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR16_loRegClassID: + case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR64_loRegClassID: + case AArch64::FPR128RegClassID: + case AArch64::FPR128_loRegClassID: + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return getRegBank(AArch64::FPRRegBankID); + case AArch64::GPR32commonRegClassID: + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32sponlyRegClassID: + case AArch64::GPR32argRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64commonRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64sponlyRegClassID: + case AArch64::GPR64argRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64noipRegClassID: + case AArch64::GPR64common_and_GPR64noipRegClassID: + case AArch64::GPR64noip_and_tcGPR64RegClassID: + case AArch64::tcGPR64RegClassID: + case AArch64::WSeqPairsClassRegClassID: + case AArch64::XSeqPairsClassRegClassID: + return getRegBank(AArch64::GPRRegBankID); + case AArch64::CCRRegClassID: + return getRegBank(AArch64::CCRegBankID); + default: + llvm_unreachable("Register class not supported"); + } +} + +RegisterBankInfo::InstructionMappings +AArch64RegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (MI.getOpcode()) { + case TargetOpcode::G_OR: { + // 32 and 64-bit or can be mapped on either FPR or + // GPR for the same cost. + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. 
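+    // (A plain G_OR has exactly three operands, one def and two uses, so any
+    // additional operand here must be an implicit one.)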
+ if (MI.getNumOperands() != 3) + break; + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size), + /*NumOperands*/ 3); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size), + /*NumOperands*/ 3); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + return AltMappings; + } + case TargetOpcode::G_BITCAST: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 2) + break; + + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, + getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, + getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &GPRToFPRMapping = getInstructionMapping( + /*ID*/ 3, + /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &FPRToGPRMapping = getInstructionMapping( + /*ID*/ 3, + /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size), + /*NumOperands*/ 2); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + AltMappings.push_back(&GPRToFPRMapping); + AltMappings.push_back(&FPRToGPRMapping); + return AltMappings; + } + case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 2) + break; + + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, + getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), + /*NumOperands*/ 2); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, + getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), + /*NumOperands*/ 2); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + return AltMappings; + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AArch64RegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + switch (OpdMapper.getMI().getOpcode()) { + case TargetOpcode::G_OR: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_LOAD: + // Those ID must match getInstrAlternativeMappings. + assert((OpdMapper.getInstrMapping().getID() >= 1 && + OpdMapper.getInstrMapping().getID() <= 4) && + "Don't know how to handle that ID"); + return applyDefaultMapping(OpdMapper); + default: + llvm_unreachable("Don't know how to handle that operation"); + } +} + +/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, +/// having only floating-point operands. 
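+/// Vector integer opcodes are not listed here; getInstrMapping assigns
+/// vector-typed operands to FPR based on their type instead.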
+static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FMA: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FLOG10: + case TargetOpcode::G_FLOG: + case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FABS: + case TargetOpcode::G_FEXP: + case TargetOpcode::G_FRINT: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: + return true; + } + return false; +} + +const RegisterBankInfo::InstructionMapping & +AArch64RegisterBankInfo::getSameKindOfOperandsMapping( + const MachineInstr &MI) const { + const unsigned Opc = MI.getOpcode(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumOperands = MI.getNumOperands(); + assert(NumOperands <= 3 && + "This code is for instructions with 3 or less operands"); + + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = Ty.getSizeInBits(); + bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + + PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; + +#ifndef NDEBUG + // Make sure all the operands are using similar size and type. + // Should probably be checked by the machine verifier. + // This code won't catch cases where the number of lanes is + // different between the operands. + // If we want to go to that level of details, it is probably + // best to check that the types are the same, period. + // Currently, we just check that the register banks are the same + // for each types. + for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { + LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); + assert( + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset( + RBIdx, OpTy.getSizeInBits()) == + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) && + "Operand has incompatible size"); + bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + (void)OpIsFPR; + assert(IsFPR == OpIsFPR && "Operand has incompatible type"); + } +#endif // End NDEBUG. + + return getInstructionMapping(DefaultMappingID, 1, + getValueMapping(RBIdx, Size), NumOperands); +} + +bool AArch64RegisterBankInfo::hasFPConstraints( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Op = MI.getOpcode(); + + // Do we have an explicit floating point instruction? + if (isPreISelGenericFloatingPointOpcode(Op)) + return true; + + // No. Check if we have a copy-like instruction. If we do, then we could + // still be fed by floating point instructions. + if (Op != TargetOpcode::COPY && !MI.isPHI()) + return false; + + // MI is copy-like. Return true if it outputs an FPR. 
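+  // Note that getRegBank may return null if the vreg has not been assigned a
+  // bank or class yet; the comparison below then yields false, i.e. we
+  // conservatively report no FP constraint.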
+ return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank; +} + +bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + +bool AArch64RegisterBankInfo::onlyDefinesFP( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case AArch64::G_DUP: + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_INSERT_VECTOR_ELT: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + +const RegisterBankInfo::InstructionMapping & +AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + const unsigned Opc = MI.getOpcode(); + + // Try the default logic for non-generic instructions that are either copies + // or already have some operands assigned to banks. + if ((Opc != TargetOpcode::COPY && !isPreISelGenericOpcode(Opc)) || + Opc == TargetOpcode::G_PHI) { + const RegisterBankInfo::InstructionMapping &Mapping = + getInstrMappingImpl(MI); + if (Mapping.isValid()) + return Mapping; + } + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + + switch (Opc) { + // G_{F|S|U}REM are not listed because they are not legal. + // Arithmetic ops. + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_PTR_ADD: + case TargetOpcode::G_MUL: + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + // Bitwise ops. + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + // Floating point ops. + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + return getSameKindOfOperandsMapping(MI); + case TargetOpcode::G_FPEXT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), + /*NumOperands*/ 2); + } + // Shifts. + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: { + LLT ShiftAmtTy = MRI.getType(MI.getOperand(2).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (ShiftAmtTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() == 32) + return getInstructionMapping(DefaultMappingID, 1, + &ValMappings[Shift64Imm], 3); + return getSameKindOfOperandsMapping(MI); + } + case TargetOpcode::COPY: { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + // Check if one of the register is not a generic register. + if ((Register::isPhysicalRegister(DstReg) || + !MRI.getType(DstReg).isValid()) || + (Register::isPhysicalRegister(SrcReg) || + !MRI.getType(SrcReg).isValid())) { + const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); + const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); + if (!DstRB) + DstRB = SrcRB; + else if (!SrcRB) + SrcRB = DstRB; + // If both RB are null that means both registers are generic. + // We shouldn't be here. 
+ assert(DstRB && SrcRB && "Both RegBank were nullptr"); + unsigned Size = getSizeInBits(DstReg, MRI, TRI); + return getInstructionMapping( + DefaultMappingID, copyCost(*DstRB, *SrcRB, Size), + getCopyMapping(DstRB->getID(), SrcRB->getID(), Size), + // We only care about the mapping of the destination. + /*NumOperands*/ 1); + } + // Both registers are generic, use G_BITCAST. + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_BITCAST: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned Size = DstTy.getSizeInBits(); + bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64; + bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64; + const RegisterBank &DstRB = + DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; + const RegisterBank &SrcRB = + SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; + return getInstructionMapping( + DefaultMappingID, copyCost(DstRB, SrcRB, Size), + getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), + // We only care about the mapping of the destination for COPY. + /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); + } + default: + break; + } + + unsigned NumOperands = MI.getNumOperands(); + + // Track the size and bank of each register. We don't do partial mappings. + SmallVector<unsigned, 4> OpSize(NumOperands); + SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + auto &MO = MI.getOperand(Idx); + if (!MO.isReg() || !MO.getReg()) + continue; + + LLT Ty = MRI.getType(MO.getReg()); + OpSize[Idx] = Ty.getSizeInBits(); + + // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. + // For floating-point instructions, scalars go in FPRs. + if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) || + Ty.getSizeInBits() > 64) + OpRegBankIdx[Idx] = PMI_FirstFPR; + else + OpRegBankIdx[Idx] = PMI_FirstGPR; + } + + unsigned Cost = 1; + // Some of the floating-point instructions have mixed GPR and FPR operands: + // fine-tune the computed mapping. + switch (Opc) { + case AArch64::G_DUP: { + Register ScalarReg = MI.getOperand(1).getReg(); + auto ScalarDef = MRI.getVRegDef(ScalarReg); + if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*ScalarDef, MRI, TRI)) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + else + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + break; + } + case TargetOpcode::G_TRUNC: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + break; + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; + OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; + break; + case TargetOpcode::G_FCMP: + OpRegBankIdx = {PMI_FirstGPR, + /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR}; + break; + case TargetOpcode::G_BITCAST: + // This is going to be a cross register bank copy and this is expensive. + if (OpRegBankIdx[0] != OpRegBankIdx[1]) + Cost = copyCost( + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank, + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank, + OpSize[0]); + break; + case TargetOpcode::G_LOAD: + // Loading in vector unit is slightly more expensive. 
+ // This is actually only true for the LD1R and co instructions, + // but anyway for the fast mode this number does not matter and + // for the greedy mode the cost of the cross bank copy will + // offset this number. + // FIXME: Should be derived from the scheduling model. + if (OpRegBankIdx[0] != PMI_FirstGPR) + Cost = 2; + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + for (const MachineInstr &UseMI : + MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) { + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (onlyUsesFP(UseMI, MRI, TRI)) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + } + break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + Register VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (onlyDefinesFP(*DefMI, MRI, TRI)) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + break; + case TargetOpcode::G_SELECT: { + // If the destination is FPR, preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + // If we're taking in vectors, we have no choice but to put everything on + // FPRs, except for the condition. The condition must always be on a GPR. + LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); + if (SrcTy.isVector()) { + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; + break; + } + + // Try to minimize the number of copies. If we have more floating point + // constrained values than not, then we'll put everything on FPR. Otherwise, + // everything has to be on GPR. + unsigned NumFP = 0; + + // Check if the uses of the result always produce floating point values. + // + // For example: + // + // %z = G_SELECT %cond %x %y + // fpr = G_FOO %z ... + if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) + ++NumFP; + + // Check if the defs of the source values always produce floating point + // values. + // + // For example: + // + // %x = G_SOMETHING_ALWAYS_FLOAT %a ... + // %z = G_SELECT %cond %x %y + // + // Also check whether or not the sources have already been decided to be + // FPR. Keep track of this. + // + // This doesn't check the condition, since it's just whatever is in NZCV. + // This isn't passed explicitly in a register to fcsel/csel. + for (unsigned Idx = 2; Idx < 4; ++Idx) { + Register VReg = MI.getOperand(Idx).getReg(); + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*DefMI, MRI, TRI)) + ++NumFP; + } + + // If we have more FP constraints than not, then move everything over to + // FPR. + if (NumFP >= 2) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; + + break; + } + case TargetOpcode::G_UNMERGE_VALUES: { + // If the first operand belongs to a FPR register bank, then make sure that + // we preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); + // UNMERGE into scalars from a vector should always use FPR. + // Likewise if any of the uses are FP instructions. 
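+    // For example (illustrative MIR only):
+    //   %lo:_(s64), %hi:_(s64) = G_UNMERGE_VALUES %v:_(<2 x s64>)
+    // is the kind of unmerge that should live entirely on FPR.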
+ if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || + any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { + // Set the register bank of every operand to FPR. + for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); + Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + // Destination and source need to be FPRs. + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // Index needs to be a GPR. + OpRegBankIdx[2] = PMI_FirstGPR; + break; + case TargetOpcode::G_INSERT_VECTOR_ELT: + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // The element may be either a GPR or FPR. Preserve that behaviour. + if (getRegBank(MI.getOperand(2).getReg(), MRI, TRI) == &AArch64::FPRRegBank) + OpRegBankIdx[2] = PMI_FirstFPR; + else + OpRegBankIdx[2] = PMI_FirstGPR; + + // Index needs to be a GPR. + OpRegBankIdx[3] = PMI_FirstGPR; + break; + case TargetOpcode::G_EXTRACT: { + // For s128 sources we have to use fpr. + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (SrcTy.getSizeInBits() == 128) { + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + } + break; + } + case TargetOpcode::G_BUILD_VECTOR: + // If the first source operand belongs to a FPR register bank, then make + // sure that we preserve that. + if (OpRegBankIdx[1] != PMI_FirstGPR) + break; + Register VReg = MI.getOperand(1).getReg(); + if (!VReg) + break; + + // Get the instruction that defined the source operand reg, and check if + // it's a floating point operation. Or, if it's a type like s16 which + // doesn't have a exact size gpr register class. + MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned DefOpc = DefMI->getOpcode(); + const LLT SrcTy = MRI.getType(VReg); + if (isPreISelGenericFloatingPointOpcode(DefOpc) || + SrcTy.getSizeInBits() < 32) { + // Have a floating point op. + // Make sure every operand gets mapped to a FPR register class. + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + + // Finally construct the computed mapping. + SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) { + auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + if (!Mapping->isValid()) + return getInvalidInstructionMapping(); + + OpdsMapping[Idx] = Mapping; + } + } + + return getInstructionMapping(DefaultMappingID, Cost, + getOperandsMapping(OpdsMapping), NumOperands); +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h new file mode 100644 index 000000000000..e956fca1aa10 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -0,0 +1,145 @@ +//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "AArch64GenRegisterBank.inc" + +namespace llvm { + +class TargetRegisterInfo; + +class AArch64GenRegisterBankInfo : public RegisterBankInfo { +protected: + enum PartialMappingIdx { + PMI_None = -1, + PMI_FPR16 = 1, + PMI_FPR32, + PMI_FPR64, + PMI_FPR128, + PMI_FPR256, + PMI_FPR512, + PMI_GPR32, + PMI_GPR64, + PMI_FirstGPR = PMI_GPR32, + PMI_LastGPR = PMI_GPR64, + PMI_FirstFPR = PMI_FPR16, + PMI_LastFPR = PMI_FPR512, + PMI_Min = PMI_FirstFPR, + }; + + static RegisterBankInfo::PartialMapping PartMappings[]; + static RegisterBankInfo::ValueMapping ValMappings[]; + static PartialMappingIdx BankIDToCopyMapIdx[]; + + enum ValueMappingIdx { + InvalidIdx = 0, + First3OpsIdx = 1, + Last3OpsIdx = 22, + DistanceBetweenRegBanks = 3, + FirstCrossRegCpyIdx = 25, + LastCrossRegCpyIdx = 39, + DistanceBetweenCrossRegCpy = 2, + FPExt16To32Idx = 41, + FPExt16To64Idx = 43, + FPExt32To64Idx = 45, + FPExt64To128Idx = 47, + Shift64Imm = 49 + }; + + static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, + unsigned ValLength, const RegisterBank &RB); + static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank, + unsigned Size, unsigned Offset); + static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias, + PartialMappingIdx LastAlias, + ArrayRef<PartialMappingIdx> Order); + + static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping representing the RegisterBank + /// at \p RBIdx with a size of \p Size. + /// + /// The returned mapping works for instructions with the same kind of + /// operands for up to 3 operands. + /// + /// \pre \p RBIdx != PartialMappingIdx::None + static const RegisterBankInfo::ValueMapping * + getValueMapping(PartialMappingIdx RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping of the operands of a copy + /// instruction from the \p SrcBankID register bank to the \p DstBankID + /// register bank with a size of \p Size. + static const RegisterBankInfo::ValueMapping * + getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + + /// Get the instruction mapping for G_FPEXT. + /// + /// \pre (DstSize, SrcSize) pair is one of the following: + /// (32, 16), (64, 16), (64, 32), (128, 64) + /// + /// \return An InstructionMapping with statically allocated OperandsMapping. + static const RegisterBankInfo::ValueMapping * + getFPExtMapping(unsigned DstSize, unsigned SrcSize); + +#define GET_TARGET_REGBANK_CLASS +#include "AArch64GenRegisterBank.inc" +}; + +/// This class provides the information for the target register banks. +class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + + /// Get an instruction mapping where all the operands map to + /// the same register bank and have similar size. + /// + /// \pre MI.getNumOperands() <= 3 + /// + /// \return An InstructionMappings with a statically allocated + /// OperandsMapping. + const InstructionMapping & + getSameKindOfOperandsMapping(const MachineInstr &MI) const; + + /// Returns true if the output of \p MI must be stored on a FPR register. 
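+  /// Copy-like instructions (COPY, PHI) whose result is already assigned to
+  /// the FPR bank also count as having an FP constraint.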
+  bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+                        const TargetRegisterInfo &TRI) const;
+
+  /// Returns true if the source registers of \p MI must all be FPRs.
+  bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+                  const TargetRegisterInfo &TRI) const;
+
+  /// Returns true if the destination register of \p MI must be an FPR.
+  bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+                     const TargetRegisterInfo &TRI) const;
+
+public:
+  AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+                    unsigned Size) const override;
+
+  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+                                             LLT) const override;
+
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+  const InstructionMapping &
+  getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
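// Usage sketch (hedged illustration, not an excerpt from LLVM; assumes a
// MachineFunction MF and a MachineInstr MI in scope, as inside RegBankSelect):
//
//   const auto *RBI = static_cast<const AArch64RegisterBankInfo *>(
//       MF.getSubtarget().getRegBankInfo());
//   const RegisterBankInfo::InstructionMapping &Mapping =
//       RBI->getInstrMapping(MI);
//   if (Mapping.isValid()) {
//     // Each operand's ValueMapping says which bank(s) that operand should
//     // live in; the greedy RegBankSelect mode compares Mapping.getCost()
//     // across the alternative mappings declared above.
//     unsigned Cost = Mapping.getCost();
//     (void)Cost;
//   }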