| author | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
| commit | 344a3780b2e33f6ca763666c380202b18aab72a3 (patch) | |
| tree | f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | |
| parent | b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff) | |
Tags:
- vendor/llvm-project/llvmorg-13-init-16847-g88e66fa60ae5
- vendor/llvm-project/llvmorg-12.0.1-rc2-0-ge7dac564cd0e
- vendor/llvm-project/llvmorg-12.0.1-0-gfed41342a82f
Diffstat (limited to 'llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp')
| -rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 1049 |
1 file changed, 628 insertions, 421 deletions
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 5259f4f5a4d0..a98248438e40 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -11,12 +11,14 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//
 
+#include "AArch64GlobalISelUtils.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64RegisterBankInfo.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
+#include "AArch64GlobalISelUtils.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/Optional.h"
@@ -24,16 +26,17 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
-#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
@@ -46,6 +49,12 @@
 using namespace llvm;
 using namespace MIPatternMatch;
+using namespace AArch64GISelUtils;
+
+namespace llvm {
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
+}
 
 namespace {
 
@@ -62,9 +71,11 @@ public:
   bool select(MachineInstr &I) override;
   static const char *getName() { return DEBUG_TYPE; }
 
-  void setupMF(MachineFunction &MF, GISelKnownBits &KB,
-               CodeGenCoverage &CoverageInfo) override {
-    InstructionSelector::setupMF(MF, KB, CoverageInfo);
+  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
+               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+               BlockFrequencyInfo *BFI) override {
+    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
+    MIB.setMF(MF);
 
     // hasFnAttribute() is expensive to call on every BRCOND selection, so
     // cache it here for each run of the selector.
@@ -85,12 +96,12 @@ private:
   bool preISelLower(MachineInstr &I);
 
   // An early selection function that runs before the selectImpl() call.
-  bool earlySelect(MachineInstr &I) const;
+  bool earlySelect(MachineInstr &I);
 
   // Do some preprocessing of G_PHIs before we begin selection.
   void processPHIs(MachineFunction &MF);
 
-  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
 
   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
   bool contractCrossBankCopyIntoStore(MachineInstr &I,
@@ -117,10 +128,10 @@ private:
   ///@}
 
   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
-                           MachineRegisterInfo &MRI) const;
+                           MachineRegisterInfo &MRI);
 
-  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
 
   // Helper to generate an equivalent of scalar_to_vector into a new register,
   // returned via 'Dst'.
@@ -139,28 +150,37 @@ private:
                           Register EltReg, unsigned LaneIdx,
                           const RegisterBank &RB,
                           MachineIRBuilder &MIRBuilder) const;
-  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
+  /// Emit a sequence of instructions representing a constant \p CV for a
+  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
+  ///
+  /// \returns the last instruction in the sequence on success, and nullptr
+  /// otherwise.
+  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
+                                   MachineIRBuilder &MIRBuilder,
+                                   MachineRegisterInfo &MRI);
+
+  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
-                              MachineRegisterInfo &MRI) const;
-  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
+                              MachineRegisterInfo &MRI);
+  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
 
-  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectSplitVectorUnmerge(MachineInstr &I,
-                                MachineRegisterInfo &MRI) const;
+  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
-                                      MachineRegisterInfo &MRI) const;
+                                      MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
-  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
 
   unsigned emitConstantPoolEntry(const Constant *CPVal,
                                  MachineFunction &MF) const;
@@ -244,17 +264,12 @@ private:
                                          Register VecReg, unsigned LaneIdx,
                                          MachineIRBuilder &MIRBuilder) const;
 
-  /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be
-  /// materialized using a FMOV instruction, then update MI and return it.
-  /// Otherwise, do nothing and return a nullptr.
-  MachineInstr *emitFMovForFConstant(MachineInstr &MI,
-                                     MachineRegisterInfo &MRI) const;
-
   /// Emit a CSet for an integer compare.
   ///
-  /// \p DefReg is expected to be a 32-bit scalar register.
+  /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
   MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
-                                MachineIRBuilder &MIRBuilder) const;
+                                MachineIRBuilder &MIRBuilder,
+                                Register SrcReg = AArch64::WZR) const;
   /// Emit a CSet for a FP compare.
   ///
   /// \p Dst is expected to be a 32-bit scalar register.
@@ -392,13 +407,18 @@ private:
                         int OpIdx = -1) const;
   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                           int OpIdx = -1) const;
+  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                     int OpIdx = -1) const;
+  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                     int OpIdx = -1) const;
+  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                     int OpIdx = -1) const;
 
   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
-  void materializeLargeCMVal(MachineInstr &I, const Value *V,
-                             unsigned OpFlags) const;
+  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
 
   // Optimization methods.
-  bool tryOptSelect(MachineInstr &MI) const;
+  bool tryOptSelect(MachineInstr &MI);
   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                       MachineOperand &Predicate,
                                       MachineIRBuilder &MIRBuilder) const;
@@ -424,6 +444,8 @@ private:
   // clobbered by calls.
   Register MFReturnAddr;
 
+  MachineIRBuilder MIB;
+
 #define GET_GLOBALISEL_PREDICATES_DECL
 #include "AArch64GenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATES_DECL
@@ -468,6 +490,8 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
     if (Ty.getSizeInBits() == 64)
       return GetAllRegSet ? &AArch64::GPR64allRegClass
                           : &AArch64::GPR64RegClass;
+    if (Ty.getSizeInBits() == 128)
+      return &AArch64::XSeqPairsClassRegClass;
     return nullptr;
   }
 
@@ -500,6 +524,8 @@ getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
     if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
+    if (SizeInBits == 128)
+      return &AArch64::XSeqPairsClassRegClass;
   }
 
   if (RegBankID == AArch64::FPRRegBankID) {
@@ -562,6 +588,58 @@ static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
   }
 }
 
+/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
+/// Helper function for functions like createDTuple and createQTuple.
+///
+/// \p RegClassIDs - The list of register class IDs available for some tuple of
+/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
+/// expected to contain between 2 and 4 tuple classes.
+///
+/// \p SubRegs - The list of subregister classes associated with each register
+/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
+/// subregister class. The index of each subregister class is expected to
+/// correspond with the index of each register class.
+///
+/// \returns Either the destination register of REG_SEQUENCE instruction that
+/// was created, or the 0th element of \p Regs if \p Regs contains a single
+/// element.
+static Register createTuple(ArrayRef<Register> Regs,
+                            const unsigned RegClassIDs[],
+                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
+  unsigned NumRegs = Regs.size();
+  if (NumRegs == 1)
+    return Regs[0];
+  assert(NumRegs >= 2 && NumRegs <= 4 &&
+         "Only support between two and 4 registers in a tuple!");
+  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
+  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
+  auto RegSequence =
+      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
+  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
+    RegSequence.addUse(Regs[I]);
+    RegSequence.addImm(SubRegs[I]);
+  }
+  return RegSequence.getReg(0);
+}
+
+/// Create a tuple of D-registers using the registers in \p Regs.
+static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
+  static const unsigned RegClassIDs[] = {
+      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+                                     AArch64::dsub2, AArch64::dsub3};
+  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
+}
+
+/// Create a tuple of Q-registers using the registers in \p Regs.
+static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
+  static const unsigned RegClassIDs[] = {
+      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+                                     AArch64::qsub2, AArch64::qsub3};
+  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
+}
+
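An aside on the createTuple helpers added above: the two lookup tables are deliberately parallel, with RegClassIDs[NumRegs - 2] naming the tuple class and SubRegs[0..NumRegs-1] naming the lane subregisters. A minimal standalone sketch of that indexing scheme, not part of the commit, with the class names reduced to strings for illustration:

```cpp
// Illustrative sketch of createTuple's table indexing (not from the commit).
#include <cassert>
#include <cstdio>

int main() {
  // Parallel tables, as in createQTuple: 2 regs -> QQ, 3 -> QQQ, 4 -> QQQQ.
  const char *RegClasses[] = {"QQ", "QQQ", "QQQQ"};
  const char *SubRegs[] = {"qsub0", "qsub1", "qsub2", "qsub3"};
  for (unsigned NumRegs = 2; NumRegs <= 4; ++NumRegs) {
    assert(NumRegs >= 2 && NumRegs <= 4 && "tuple size out of range");
    std::printf("%u regs -> REG_SEQUENCE into %s:", NumRegs,
                RegClasses[NumRegs - 2]);
    for (unsigned I = 0; I < NumRegs; ++I)
      std::printf(" %s", SubRegs[I]);
    std::printf("\n");
  }
  return 0;
}
```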
 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
   auto &MI = *Root.getParent();
   auto &MBB = *MI.getParent();
@@ -865,8 +943,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
 #ifndef NDEBUG
     ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
     assert(ValidCopy && "Invalid copy.");
-    (void)KnownValid;
 #endif
+    (void)KnownValid;
     return ValidCopy;
   };
 
@@ -932,6 +1010,15 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                         << " operand\n");
       return false;
     }
+
+    // If this a GPR ZEXT that we want to just reduce down into a copy.
+    // The sizes will be mismatched with the source < 32b but that's ok.
+    if (I.getOpcode() == TargetOpcode::G_ZEXT) {
+      I.setDesc(TII.get(AArch64::COPY));
+      assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
+      return selectCopy(I, TII, MRI, TRI, RBI);
+    }
+
     I.setDesc(TII.get(AArch64::COPY));
     return CheckCopy();
   }
@@ -1085,7 +1172,9 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True,
   //
   // Into:
   //   %select = CSINC %reg, %x, cc
-  if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) {
+  if (mi_match(Reg, MRI,
+               m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
+                        m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
     Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
     Reg = MatchReg;
     if (Invert) {
@@ -1208,60 +1297,6 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
   }
 }
 
-static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
-                                      AArch64CC::CondCode &CondCode,
-                                      AArch64CC::CondCode &CondCode2) {
-  CondCode2 = AArch64CC::AL;
-  switch (P) {
-  default:
-    llvm_unreachable("Unknown FP condition!");
-  case CmpInst::FCMP_OEQ:
-    CondCode = AArch64CC::EQ;
-    break;
-  case CmpInst::FCMP_OGT:
-    CondCode = AArch64CC::GT;
-    break;
-  case CmpInst::FCMP_OGE:
-    CondCode = AArch64CC::GE;
-    break;
-  case CmpInst::FCMP_OLT:
-    CondCode = AArch64CC::MI;
-    break;
-  case CmpInst::FCMP_OLE:
-    CondCode = AArch64CC::LS;
-    break;
-  case CmpInst::FCMP_ONE:
-    CondCode = AArch64CC::MI;
-    CondCode2 = AArch64CC::GT;
-    break;
-  case CmpInst::FCMP_ORD:
-    CondCode = AArch64CC::VC;
-    break;
-  case CmpInst::FCMP_UNO:
-    CondCode = AArch64CC::VS;
-    break;
-  case CmpInst::FCMP_UEQ:
-    CondCode = AArch64CC::EQ;
-    CondCode2 = AArch64CC::VS;
-    break;
-  case CmpInst::FCMP_UGT:
-    CondCode = AArch64CC::HI;
-    break;
-  case CmpInst::FCMP_UGE:
-    CondCode = AArch64CC::PL;
-    break;
-  case CmpInst::FCMP_ULT:
-    CondCode = AArch64CC::LT;
-    break;
-  case CmpInst::FCMP_ULE:
-    CondCode = AArch64CC::LE;
-    break;
-  case CmpInst::FCMP_UNE:
-    CondCode = AArch64CC::NE;
-    break;
-  }
-}
-
 /// Return a register which can be used as a bit to test in a TB(N)Z.
 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                               MachineRegisterInfo &MRI) {
@@ -1605,7 +1640,7 @@
 }
 
 bool AArch64InstructionSelector::selectCompareBranch(
-    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
   Register CondReg = I.getOperand(0).getReg();
   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
   if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
@@ -1615,7 +1650,6 @@ bool AArch64InstructionSelector::selectCompareBranch(
 
   // Try to select the G_BRCOND using whatever is feeding the condition if
   // possible.
-  MachineIRBuilder MIB(I);
   unsigned CCMIOpc = CCMI->getOpcode();
   if (CCMIOpc == TargetOpcode::G_FCMP)
     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
@@ -1650,23 +1684,7 @@ static Optional<int64_t> getVectorShiftImm(Register Reg,
   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
   MachineInstr *OpMI = MRI.getVRegDef(Reg);
   assert(OpMI && "Expected to find a vreg def for vector shift operand");
-  if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
-    return None;
-
-  // Check all operands are identical immediates.
-  int64_t ImmVal = 0;
-  for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) {
-    auto VRegAndVal =
-        getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI);
-    if (!VRegAndVal)
-      return None;
-
-    if (Idx == 1)
-      ImmVal = VRegAndVal->Value.getSExtValue();
-    if (ImmVal != VRegAndVal->Value.getSExtValue())
-      return None;
-  }
-
-  return ImmVal;
+  return getAArch64VectorSplatScalar(*OpMI, MRI);
 }
 
 /// Matches and returns the shift immediate value for a SHL instruction given
@@ -1703,8 +1721,8 @@ static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegiste
   return Imm;
 }
 
-bool AArch64InstructionSelector::selectVectorSHL(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
+                                                 MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_SHL);
   Register DstReg = I.getOperand(0).getReg();
   const LLT Ty = MRI.getType(DstReg);
@@ -1719,26 +1737,25 @@ bool AArch64InstructionSelector::selectVectorSHL(
   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
 
   unsigned Opc = 0;
-  if (Ty == LLT::vector(2, 64)) {
+  if (Ty == LLT::fixed_vector(2, 64)) {
     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
-  } else if (Ty == LLT::vector(4, 32)) {
+  } else if (Ty == LLT::fixed_vector(4, 32)) {
     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
-  } else if (Ty == LLT::vector(2, 32)) {
+  } else if (Ty == LLT::fixed_vector(2, 32)) {
     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
-  } else if (Ty == LLT::vector(4, 16)) {
+  } else if (Ty == LLT::fixed_vector(4, 16)) {
     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
-  } else if (Ty == LLT::vector(8, 16)) {
+  } else if (Ty == LLT::fixed_vector(8, 16)) {
     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
-  } else if (Ty == LLT::vector(16, 8)) {
+  } else if (Ty == LLT::fixed_vector(16, 8)) {
     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
-  } else if (Ty == LLT::vector(8, 8)) {
+  } else if (Ty == LLT::fixed_vector(8, 8)) {
     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
   } else {
     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
     return false;
   }
 
-  MachineIRBuilder MIB(I);
   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
   if (ImmVal)
     Shl.addImm(*ImmVal);
@@ -1750,7 +1767,7 @@
 }
 
 bool AArch64InstructionSelector::selectVectorAshrLshr(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
          I.getOpcode() == TargetOpcode::G_LSHR);
   Register DstReg = I.getOperand(0).getReg();
@@ -1774,25 +1791,25 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
   unsigned NegOpc = 0;
   const TargetRegisterClass *RC =
       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
-  if (Ty == LLT::vector(2, 64)) {
+  if (Ty == LLT::fixed_vector(2, 64)) {
     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
     NegOpc = AArch64::NEGv2i64;
-  } else if (Ty == LLT::vector(4, 32)) {
+  } else if (Ty == LLT::fixed_vector(4, 32)) {
     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
     NegOpc = AArch64::NEGv4i32;
-  } else if (Ty == LLT::vector(2, 32)) {
+  } else if (Ty == LLT::fixed_vector(2, 32)) {
     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
     NegOpc = AArch64::NEGv2i32;
-  } else if (Ty == LLT::vector(4, 16)) {
+  } else if (Ty == LLT::fixed_vector(4, 16)) {
     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
     NegOpc = AArch64::NEGv4i16;
-  } else if (Ty == LLT::vector(8, 16)) {
+  } else if (Ty == LLT::fixed_vector(8, 16)) {
     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
     NegOpc = AArch64::NEGv8i16;
-  } else if (Ty == LLT::vector(16, 8)) {
+  } else if (Ty == LLT::fixed_vector(16, 8)) {
     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
-    NegOpc = AArch64::NEGv8i16;
-  } else if (Ty == LLT::vector(8, 8)) {
+    NegOpc = AArch64::NEGv16i8;
+  } else if (Ty == LLT::fixed_vector(8, 8)) {
     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
     NegOpc = AArch64::NEGv8i8;
   } else {
@@ -1800,7 +1817,6 @@
     return false;
   }
 
-  MachineIRBuilder MIB(I);
   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
@@ -1842,11 +1858,10 @@ bool AArch64InstructionSelector::selectVaStartDarwin(
 }
 
 void AArch64InstructionSelector::materializeLargeCMVal(
-    MachineInstr &I, const Value *V, unsigned OpFlags) const {
+    MachineInstr &I, const Value *V, unsigned OpFlags) {
   MachineBasicBlock &MBB = *I.getParent();
   MachineFunction &MF = *MBB.getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  MachineIRBuilder MIB(I);
 
   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
   MovZ->addOperand(MF, I.getOperand(1));
@@ -1907,7 +1922,6 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
     assert(AmtMI && "could not find a vreg definition for shift amount");
     if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
       // Insert a subregister copy to implement a 64->32 trunc
-      MachineIRBuilder MIB(I);
       auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
                        .addReg(ShiftReg, 0, AArch64::sub_32);
       MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
@@ -1915,8 +1929,21 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
     }
     return true;
   }
-  case TargetOpcode::G_STORE:
-    return contractCrossBankCopyIntoStore(I, MRI);
+  case TargetOpcode::G_STORE: {
+    bool Changed = contractCrossBankCopyIntoStore(I, MRI);
+    MachineOperand &SrcOp = I.getOperand(0);
+    if (MRI.getType(SrcOp.getReg()).isPointer()) {
+      // Allow matching with imported patterns for stores of pointers. Unlike
+      // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
+      // and constrain.
+      auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
+      Register NewSrc = Copy.getReg(0);
+      SrcOp.setReg(NewSrc);
+      RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
+      Changed = true;
+    }
+    return Changed;
+  }
   case TargetOpcode::G_PTR_ADD:
     return convertPtrAddToAdd(I, MRI);
   case TargetOpcode::G_LOAD: {
@@ -1936,11 +1963,10 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
     if (!DstTy.getElementType().isPointer())
       return false;
-    MachineIRBuilder MIB(I);
     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
     MRI.setType(I.getOperand(0).getReg(),
                 DstTy.changeElementType(LLT::scalar(64)));
-    MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+    MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
     I.getOperand(1).setReg(NewSrc.getReg(0));
     return true;
   }
@@ -1987,8 +2013,8 @@ bool AArch64InstructionSelector::convertPtrAddToAdd(
   if (PtrTy.getAddressSpace() != 0)
     return false;
 
-  MachineIRBuilder MIB(I);
-  const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
+  const LLT CastPtrTy =
+      PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
   // Set regbanks on the registers.
   if (PtrTy.isVector())
@@ -2016,8 +2042,8 @@ bool AArch64InstructionSelector::convertPtrAddToAdd(
   return true;
 }
 
-bool AArch64InstructionSelector::earlySelectSHL(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
+                                                MachineRegisterInfo &MRI) {
   // We try to match the immediate variant of LSL, which is actually an alias
   // for a special case of UBFM. Otherwise, we fall back to the imported
   // selector which will match the register variant.
@@ -2033,7 +2059,6 @@ bool AArch64InstructionSelector::earlySelectSHL(
   bool Is64Bit = DstTy.getSizeInBits() == 64;
   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
-  MachineIRBuilder MIB(I);
 
   if (!Imm1Fn || !Imm2Fn)
     return false;
@@ -2093,7 +2118,7 @@ bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
   return true;
 }
 
-bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
+bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
   assert(I.getParent() && "Instruction should be in a basic block!");
   assert(I.getParent()->getParent() && "Instruction should be in a function!");
 
@@ -2102,6 +2127,24 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   switch (I.getOpcode()) {
+  case AArch64::G_DUP: {
+    // Before selecting a DUP instruction, check if it is better selected as a
+    // MOV or load from a constant pool.
+    Register Src = I.getOperand(1).getReg();
+    auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
+    if (!ValAndVReg)
+      return false;
+    LLVMContext &Ctx = MF.getFunction().getContext();
+    Register Dst = I.getOperand(0).getReg();
+    auto *CV = ConstantDataVector::getSplat(
+        MRI.getType(Dst).getNumElements(),
+        ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
+                         ValAndVReg->Value));
+    if (!emitConstantVector(Dst, CV, MIB, MRI))
+      return false;
+    I.eraseFromParent();
+    return true;
+  }
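Both the G_DUP early-selection just added and the earlier getVectorShiftImm change lean on recognizing a vector whose lanes are all the same constant. A rough standalone model of that splat check, not part of the commit (the in-tree helper, getAArch64VectorSplatScalar, works on G_BUILD_VECTOR operands and vreg constants; names here are illustrative):

```cpp
// Rough model of splat detection (illustrative; the in-tree helper is
// getAArch64VectorSplatScalar, which inspects G_BUILD_VECTOR operands).
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

static std::optional<int64_t>
getSplatScalar(const std::vector<int64_t> &Lanes) {
  if (Lanes.empty())
    return std::nullopt;
  for (int64_t V : Lanes)
    if (V != Lanes.front())
      return std::nullopt; // Not all lanes identical: no splat.
  return Lanes.front();
}

int main() {
  if (auto S = getSplatScalar({3, 3, 3, 3}))
    std::printf("splat of %lld -> candidate for a MOV or constant-pool load\n",
                (long long)*S);
  if (!getSplatScalar({3, 1, 3, 3}))
    std::printf("mixed lanes -> no splat\n");
  return 0;
}
```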
   case TargetOpcode::G_BR: {
     // If the branch jumps to the fallthrough block, don't bother emitting it.
     // Only do this for -O0 for a good code size improvement, because when
@@ -2139,6 +2182,74 @@
     I.setDesc(TII.get(TargetOpcode::COPY));
     return true;
   }
+
+  case TargetOpcode::G_ADD: {
+    // Check if this is being fed by a G_ICMP on either side.
+    //
+    // (cmp pred, x, y) + z
+    //
+    // In the above case, when the cmp is true, we increment z by 1. So, we can
+    // fold the add into the cset for the cmp by using cinc.
+    //
+    // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
+    Register X = I.getOperand(1).getReg();
+
+    // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
+    // early if we see it.
+    LLT Ty = MRI.getType(X);
+    if (Ty.isVector() || Ty.getSizeInBits() != 32)
+      return false;
+
+    Register CmpReg = I.getOperand(2).getReg();
+    MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+    if (!Cmp) {
+      std::swap(X, CmpReg);
+      Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+      if (!Cmp)
+        return false;
+    }
+    auto Pred =
+        static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
+    emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
+                       Cmp->getOperand(1), MIB);
+    emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
+    I.eraseFromParent();
+    return true;
+  }
+  case TargetOpcode::G_OR: {
+    // Look for operations that take the lower `Width=Size-ShiftImm` bits of
+    // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
+    // shifting and masking that we can replace with a BFI (encoded as a BFM).
+    Register Dst = I.getOperand(0).getReg();
+    LLT Ty = MRI.getType(Dst);
+
+    if (!Ty.isScalar())
+      return false;
+
+    unsigned Size = Ty.getSizeInBits();
+    if (Size != 32 && Size != 64)
+      return false;
+
+    Register ShiftSrc;
+    int64_t ShiftImm;
+    Register MaskSrc;
+    int64_t MaskImm;
+    if (!mi_match(
+            Dst, MRI,
+            m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
+                  m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
+      return false;
+
+    if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
+      return false;
+
+    int64_t Immr = Size - ShiftImm;
+    int64_t Imms = Size - ShiftImm - 1;
+    unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
+    emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
+    I.eraseFromParent();
+    return true;
+  }
   default:
     return false;
   }
@@ -2160,6 +2271,8 @@
     return false;
   }
 
+  MIB.setInstrAndDebugLoc(I);
+
   unsigned Opcode = I.getOpcode();
   // G_PHI requires same handling as PHI
   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
@@ -2229,9 +2342,30 @@
   LLT Ty =
       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
 
-  MachineIRBuilder MIB(I);
-
   switch (Opcode) {
+  case TargetOpcode::G_SBFX:
+  case TargetOpcode::G_UBFX: {
+    static const unsigned OpcTable[2][2] = {
+        {AArch64::UBFMWri, AArch64::UBFMXri},
+        {AArch64::SBFMWri, AArch64::SBFMXri}};
+    bool IsSigned = Opcode == TargetOpcode::G_SBFX;
+    unsigned Size = Ty.getSizeInBits();
+    unsigned Opc = OpcTable[IsSigned][Size == 64];
+    auto Cst1 =
+        getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
+    assert(Cst1 && "Should have gotten a constant for src 1?");
+    auto Cst2 =
+        getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
+    assert(Cst2 && "Should have gotten a constant for src 2?");
+    auto LSB = Cst1->Value.getZExtValue();
+    auto Width = Cst2->Value.getZExtValue();
+    auto BitfieldInst =
+        MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
            .addImm(LSB)
            .addImm(LSB + Width - 1);
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
+  }
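Both the new G_SBFX/G_UBFX case above and the earlier G_OR to BFI fold bottom out in BFM-family immediates. A self-contained arithmetic check of those two encodings, with made-up values for illustration:

```cpp
// Worked example of the BFM-family immediates used above (illustrative).
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // G_UBFX lsb, width -> UBFM src, lsb, lsb + width - 1.
  unsigned LSB = 4, Width = 8;
  std::printf("UBFMWri dst, src, %u, %u\n", LSB, LSB + Width - 1);
  uint32_t V = 0xABCD1234;
  std::printf("extracted field: 0x%x\n", (V >> LSB) & ((1u << Width) - 1));

  // G_OR (shl x, N) | (and y, (1 << N) - 1) -> BFM with Immr = Size - N and
  // Imms = Size - N - 1, guarded by the exact-mask check from the fold.
  const unsigned Size = 32;
  const int64_t ShiftImm = 8, MaskImm = 0xFF;
  assert(ShiftImm <= Size && ((1ULL << ShiftImm) - 1ULL) == uint64_t(MaskImm));
  std::printf("BFMWri dst, src, %lld, %lld\n", (long long)(Size - ShiftImm),
              (long long)(Size - ShiftImm - 1));
  return 0;
}
```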
   case TargetOpcode::G_BRCOND:
     return selectCompareBranch(I, MF, MRI);
 
@@ -2256,7 +2390,6 @@
     }
     assert(TM.getCodeModel() == CodeModel::Small && "Expected small code model");
-    MachineIRBuilder MIB(I);
     auto Op1 = BaseMI->getOperand(1);
     auto Op2 = I.getOperand(2);
     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
@@ -2373,14 +2506,11 @@
               : (DefSize == 64 ? AArch64::FPR64RegClass : AArch64::FPR128RegClass);
 
-    // Can we use a FMOV instruction to represent the immediate?
-    if (emitFMovForFConstant(I, MRI))
-      return true;
-
     // For 64b values, emit a constant pool load instead.
-    if (DefSize == 64 || DefSize == 128) {
+    // For s32, use a cp load if we have optsize/minsize.
+    if (DefSize == 64 || DefSize == 128 ||
+        (DefSize == 32 && shouldOptForSize(&MF))) {
       auto *FPImm = I.getOperand(1).getFPImm();
-      MachineIRBuilder MIB(I);
       auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
       if (!LoadMI) {
         LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
@@ -2435,21 +2565,25 @@
     if (DstTy.getSizeInBits() != 64)
       return false;
 
+    unsigned Offset = I.getOperand(2).getImm();
+    if (Offset % 64 != 0)
+      return false;
+
+    // Check we have the right regbank always.
     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
-    // Check we have the right regbank always.
-    assert(SrcRB.getID() == AArch64::FPRRegBankID &&
-           DstRB.getID() == AArch64::FPRRegBankID &&
-           "Wrong extract regbank!");
-    (void)SrcRB;
+    assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
+
+    if (SrcRB.getID() == AArch64::GPRRegBankID) {
+      MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+          .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
+      I.eraseFromParent();
+      return true;
+    }
 
     // Emit the same code as a vector extract.
     // Offset must be a multiple of 64.
-    unsigned Offset = I.getOperand(2).getImm();
-    if (Offset % 64 != 0)
-      return false;
     unsigned LaneIdx = Offset / 64;
-    MachineIRBuilder MIB(I);
     MachineInstr *Extract = emitExtractVectorElt(
         DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
     if (!Extract)
@@ -2560,8 +2694,6 @@
   case TargetOpcode::G_LOAD:
   case TargetOpcode::G_STORE: {
     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
-    MachineIRBuilder MIB(I);
-
     LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
 
     if (PtrTy != LLT::pointer(0, 64)) {
@@ -2572,18 +2704,29 @@
     auto &MemOp = **I.memoperands_begin();
     uint64_t MemSizeInBytes = MemOp.getSize();
-    if (MemOp.isAtomic()) {
-      // For now we just support s8 acquire loads to be able to compile stack
-      // protector code.
-      if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
-          MemSizeInBytes == 1) {
-        I.setDesc(TII.get(AArch64::LDARB));
-        return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+    unsigned MemSizeInBits = MemSizeInBytes * 8;
+    AtomicOrdering Order = MemOp.getSuccessOrdering();
+
+    // Need special instructions for atomics that affect ordering.
+    if (Order != AtomicOrdering::NotAtomic &&
+        Order != AtomicOrdering::Unordered &&
+        Order != AtomicOrdering::Monotonic) {
+      assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
+      if (MemSizeInBytes > 64)
+        return false;
+
+      if (I.getOpcode() == TargetOpcode::G_LOAD) {
+        static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
+                                     AArch64::LDARW, AArch64::LDARX};
+        I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
+      } else {
+        static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
+                                     AArch64::STLRW, AArch64::STLRX};
+        I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
       }
-      LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
-      return false;
+      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+      return true;
     }
-    unsigned MemSizeInBits = MemSizeInBytes * 8;
 
 #ifndef NDEBUG
     const Register PtrReg = I.getOperand(1).getReg();
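The acquire/release handling above replaces the old s8-only LDARB special case with opcode tables indexed by Log2_32(MemSizeInBytes). A tiny standalone rendering of that indexing, illustrative only:

```cpp
// Illustrative rendering of the LDAR/STLR opcode-table indexing above.
#include <cstdio>

int main() {
  const char *Loads[] = {"LDARB", "LDARH", "LDARW", "LDARX"};
  const char *Stores[] = {"STLRB", "STLRH", "STLRW", "STLRX"};
  for (unsigned Bytes : {1u, 2u, 4u, 8u}) {
    unsigned Idx = 0;
    while ((1u << (Idx + 1)) <= Bytes) // floor(log2(Bytes)), like Log2_32
      ++Idx;
    std::printf("%u-byte atomic: %s / %s\n", Bytes, Loads[Idx], Stores[Idx]);
  }
  return 0;
}
```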
@@ -2737,9 +2880,7 @@
   }
 
   case TargetOpcode::G_PTR_ADD: {
-    MachineIRBuilder MIRBuilder(I);
-    emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
-            MIRBuilder);
+    emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
     I.eraseFromParent();
     return true;
   }
@@ -2748,18 +2889,16 @@
   case TargetOpcode::G_SADDO:
   case TargetOpcode::G_SSUBO:
   case TargetOpcode::G_USUBO: {
     // Emit the operation and get the correct condition code.
-    MachineIRBuilder MIRBuilder(I);
     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
-                                  I.getOperand(2), I.getOperand(3), MIRBuilder);
+                                  I.getOperand(2), I.getOperand(3), MIB);
 
     // Now, put the overflow result in the register given by the first operand
     // to the overflow op. CSINC increments the result when the predicate is
     // false, so to get the increment when it's true, we need to use the
     // inverse. In this case, we want to increment when carry is set.
     Register ZReg = AArch64::WZR;
-    auto CsetMI = MIRBuilder
-                      .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
-                                  {ZReg, ZReg})
+    auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
+                                 {ZReg, ZReg})
                       .addImm(getInvertedCondCode(OpAndCC.second));
     constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
     I.eraseFromParent();
@@ -2832,14 +2971,14 @@
       I.setDesc(TII.get(TargetOpcode::COPY));
       return true;
     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
-      if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
+      if (DstTy == LLT::fixed_vector(4, 16) &&
+          SrcTy == LLT::fixed_vector(4, 32)) {
         I.setDesc(TII.get(AArch64::XTNv4i16));
         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
         return true;
       }
 
       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
-        MachineIRBuilder MIB(I);
         MachineInstr *Extract = emitExtractVectorElt(
             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
         if (!Extract)
@@ -2927,7 +3066,6 @@
                AArch64::GPRRegBankID &&
            "Unexpected ext regbank");
 
-    MachineIRBuilder MIB(I);
     MachineInstr *ExtI;
 
     // First check if we're extending the result of a load which has a dest type
@@ -2947,34 +3085,46 @@
       return selectCopy(I, TII, MRI, TRI, RBI);
     }
 
+    // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
+    // + SUBREG_TO_REG.
+    //
     // If we are zero extending from 32 bits to 64 bits, it's possible that
     // the instruction implicitly does the zero extend for us. In that case,
-    // we can just emit a SUBREG_TO_REG.
+    // we only need the SUBREG_TO_REG.
     if (IsGPR && SrcSize == 32 && DstSize == 64) {
       // Unlike with the G_LOAD case, we don't want to look through copies
-      // here.
+      // here. (See isDef32.)
       MachineInstr *Def = MRI.getVRegDef(SrcReg);
-      if (Def && isDef32(*Def)) {
-        MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
-            .addImm(0)
-            .addUse(SrcReg)
-            .addImm(AArch64::sub_32);
+      Register SubregToRegSrc = SrcReg;
 
-        if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
-                                          MRI)) {
-          LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
-          return false;
-        }
+      // Does the instruction implicitly zero extend?
+      if (!Def || !isDef32(*Def)) {
+        // No. Zero out using an OR.
+        Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+        const Register ZReg = AArch64::WZR;
+        MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
+        SubregToRegSrc = OrDst;
+      }
 
-        if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
-                                          MRI)) {
-          LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
-          return false;
-        }
+      MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
+          .addImm(0)
+          .addUse(SubregToRegSrc)
+          .addImm(AArch64::sub_32);
 
-        I.eraseFromParent();
-        return true;
+      if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
+                                        MRI)) {
+        LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
+        return false;
       }
+
+      if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+                                        MRI)) {
+        LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
+        return false;
+      }
+
+      I.eraseFromParent();
+      return true;
     }
   }
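The reworked G_ZEXT path above rests on an architectural fact: any AArch64 instruction that writes a W register zeroes bits 63:32 of the corresponding X register, so a plain ORRWrs (a 32-bit mov) followed by SUBREG_TO_REG is a full zero extension, and the mov can be skipped when isDef32 already guarantees the producer zeroed the top half. A trivial C++ analogue of the value-level behavior, illustrative only:

```cpp
// Illustrative analogue of the 32 -> 64 zero-extension above: 'mov w8, w0'
// (ORRWrs) already clears the top half of x8, so the zext is that mov plus
// a SUBREG_TO_REG retagging the value as 64-bit.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t W = 0xDEADBEEF;
  uint64_t X = W; // zext: upper 32 bits are guaranteed zero
  std::printf("0x%016llx\n", (unsigned long long)X);
  return 0;
}
```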
@@ -3061,7 +3211,6 @@
     // Make sure to use an unused vreg instead of wzr, so that the peephole
     // optimizations will be able to optimize these.
-    MachineIRBuilder MIB(I);
     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
@@ -3081,22 +3230,20 @@
       return false;
     }
 
-    MachineIRBuilder MIRBuilder(I);
     auto Pred =
         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
-                       MIRBuilder);
-    emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
+                       MIB);
+    emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
     I.eraseFromParent();
     return true;
   }
 
   case TargetOpcode::G_FCMP: {
-    MachineIRBuilder MIRBuilder(I);
     CmpInst::Predicate Pred =
         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
-    if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(),
-                       MIRBuilder, Pred) ||
-        !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder))
+    if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
+                       Pred) ||
+        !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
       return false;
     I.eraseFromParent();
     return true;
   }
@@ -3142,14 +3289,18 @@
     // difficult because at RBS we may end up pessimizing the fpr case if we
     // decided to add an anyextend to fix this. Manual selection is the most
     // robust solution for now.
-    Register SrcReg = I.getOperand(1).getReg();
-    if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID)
+    if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
+        AArch64::GPRRegBankID)
       return false;
     // We expect the fpr regbank case to be imported.
-    LLT SrcTy = MRI.getType(SrcReg);
-    if (SrcTy.getSizeInBits() == 16)
-      I.setDesc(TII.get(AArch64::DUPv8i16gpr));
-    else if (SrcTy.getSizeInBits() == 8)
+    LLT VecTy = MRI.getType(I.getOperand(0).getReg());
+    if (VecTy == LLT::fixed_vector(8, 8))
+      I.setDesc(TII.get(AArch64::DUPv8i8gpr));
+    else if (VecTy == LLT::fixed_vector(16, 8))
       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
+    else if (VecTy == LLT::fixed_vector(4, 16))
+      I.setDesc(TII.get(AArch64::DUPv4i16gpr));
+    else if (VecTy == LLT::fixed_vector(8, 16))
+      I.setDesc(TII.get(AArch64::DUPv8i16gpr));
     else
       return false;
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -3182,19 +3333,33 @@
   return false;
 }
 
-bool AArch64InstructionSelector::selectReduction(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
+                                                 MachineRegisterInfo &MRI) {
   Register VecReg = I.getOperand(1).getReg();
   LLT VecTy = MRI.getType(VecReg);
   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
+    // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
+    // a subregister copy afterwards.
+    if (VecTy == LLT::fixed_vector(2, 32)) {
+      Register DstReg = I.getOperand(0).getReg();
+      auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
+                                 {VecReg, VecReg});
+      auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+                      .addReg(AddP.getReg(0), 0, AArch64::ssub)
+                      .getReg(0);
+      RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
+      I.eraseFromParent();
+      return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
+    }
+
     unsigned Opc = 0;
-    if (VecTy == LLT::vector(16, 8))
+    if (VecTy == LLT::fixed_vector(16, 8))
       Opc = AArch64::ADDVv16i8v;
-    else if (VecTy == LLT::vector(8, 16))
+    else if (VecTy == LLT::fixed_vector(8, 16))
       Opc = AArch64::ADDVv8i16v;
-    else if (VecTy == LLT::vector(4, 32))
+    else if (VecTy == LLT::fixed_vector(4, 32))
       Opc = AArch64::ADDVv4i32v;
-    else if (VecTy == LLT::vector(2, 64))
+    else if (VecTy == LLT::fixed_vector(2, 64))
       Opc = AArch64::ADDPv2i64p;
     else {
       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
@@ -3206,9 +3371,9 @@
   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
     unsigned Opc = 0;
-    if (VecTy == LLT::vector(2, 32))
+    if (VecTy == LLT::fixed_vector(2, 32))
       Opc = AArch64::FADDPv2i32p;
-    else if (VecTy == LLT::vector(2, 64))
+    else if (VecTy == LLT::fixed_vector(2, 64))
       Opc = AArch64::FADDPv2i64p;
     else {
       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
@@ -3221,12 +3386,11 @@
 }
 
 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
-                                            MachineRegisterInfo &MRI) const {
+                                            MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
   Register JTAddr = I.getOperand(0).getReg();
   unsigned JTI = I.getOperand(1).getIndex();
   Register Index = I.getOperand(2).getReg();
-  MachineIRBuilder MIB(I);
 
   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
@@ -3241,15 +3405,14 @@
   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
 }
 
-bool AArch64InstructionSelector::selectJumpTable(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
+                                                 MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
 
   Register DstReg = I.getOperand(0).getReg();
   unsigned JTI = I.getOperand(1).getIndex();
   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
-  MachineIRBuilder MIB(I);
   auto MovMI =
       MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
@@ -3259,14 +3422,16 @@
 }
 
 bool AArch64InstructionSelector::selectTLSGlobalValue(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   if (!STI.isTargetMachO())
     return false;
   MachineFunction &MF = *I.getParent()->getParent();
   MF.getFrameInfo().setAdjustsStack(true);
 
-  const GlobalValue &GV = *I.getOperand(1).getGlobal();
-  MachineIRBuilder MIB(I);
+  const auto &GlobalOp = I.getOperand(1);
+  assert(GlobalOp.getOffset() == 0 &&
+         "Shouldn't have an offset on TLS globals!");
+  const GlobalValue &GV = *GlobalOp.getGlobal();
 
   auto LoadGOT =
       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
@@ -3403,7 +3568,7 @@ bool AArch64InstructionSelector::selectIntrinsicRound(
 }
 
 bool AArch64InstructionSelector::selectVectorICmp(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   Register DstReg = I.getOperand(0).getReg();
   LLT DstTy = MRI.getType(DstReg);
   Register SrcReg = I.getOperand(2).getReg();
@@ -3558,7 +3723,6 @@
   if (SwapOperands)
     std::swap(SrcReg, Src2Reg);
 
-  MachineIRBuilder MIB(I);
   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
@@ -3602,7 +3766,7 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
 }
 
 bool AArch64InstructionSelector::selectMergeValues(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
@@ -3616,7 +3780,6 @@ bool AArch64InstructionSelector::selectMergeValues(
   if (DstTy == LLT::scalar(128)) {
     if (SrcTy.getSizeInBits() != 64)
       return false;
-    MachineIRBuilder MIB(I);
     Register DstReg = I.getOperand(0).getReg();
     Register Src1Reg = I.getOperand(1).getReg();
     Register Src2Reg = I.getOperand(2).getReg();
@@ -3757,7 +3920,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
 }
 
 bool AArch64InstructionSelector::selectExtractElt(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
          "unexpected opcode!");
   Register DstReg = I.getOperand(0).getReg();
@@ -3784,11 +3947,10 @@ bool AArch64InstructionSelector::selectExtractElt(
     return false;
   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
 
-  MachineIRBuilder MIRBuilder(I);
   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
 
   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
-                                               LaneIdx, MIRBuilder);
+                                               LaneIdx, MIB);
   if (!Extract)
     return false;
 
@@ -3797,7 +3959,7 @@
 }
 
 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   unsigned NumElts = I.getNumOperands() - 1;
   Register SrcReg = I.getOperand(NumElts).getReg();
   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
@@ -3809,8 +3971,6 @@ bool AArch64InstructionSelector::selectSplitVectorUnmerge(
     return false;
   }
 
-  MachineIRBuilder MIB(I);
-
   // We implement a split vector operation by treating the sub-vectors as
   // scalars and extracting them.
   const RegisterBank &DstRB =
@@ -3826,8 +3986,8 @@
   return true;
 }
 
-bool AArch64InstructionSelector::selectUnmergeValues(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
+                                                     MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
          "unexpected opcode");
 
@@ -3856,8 +4016,6 @@ bool AArch64InstructionSelector::selectUnmergeValues(
   if (!NarrowTy.isScalar())
     return selectSplitVectorUnmerge(I, MRI);
 
-  MachineIRBuilder MIB(I);
-
   // Choose a lane copy opcode and subregister based off of the size of the
   // vector's elements.
   unsigned CopyOpc = 0;
@@ -3882,6 +4040,13 @@
   } else {
     // No. We have to perform subregister inserts. For each insert, create an
     // implicit def and a subregister insert, and save the register we create.
+    const TargetRegisterClass *RC =
+        getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
+                              WideTy.getScalarSizeInBits() * NumElts);
+    unsigned SubReg = 0;
+    bool Found = getSubRegForClass(RC, TRI, SubReg);
+    (void)Found;
+    assert(Found && "expected to find last operand's subeg idx");
     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
       MachineInstr &ImpDefMI =
@@ -3895,7 +4060,7 @@
               TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
               .addUse(ImpDefReg)
              .addUse(SrcReg)
-              .addImm(AArch64::dsub);
+              .addImm(SubReg);
 
       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
@@ -3942,14 +4107,13 @@
 }
 
 bool AArch64InstructionSelector::selectConcatVectors(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
+    MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
          "Unexpected opcode");
   Register Dst = I.getOperand(0).getReg();
   Register Op1 = I.getOperand(1).getReg();
   Register Op2 = I.getOperand(2).getReg();
-  MachineIRBuilder MIRBuilder(I);
-  MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder);
+  MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
   if (!ConcatMI)
     return false;
   I.eraseFromParent();
@@ -3968,14 +4132,17 @@ AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
 
 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
-  unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
+  auto &MF = MIRBuilder.getMF();
+  unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
   auto Adrp =
       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
 
   MachineInstr *LoadMI = nullptr;
-  switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
+  unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
+  switch (Size) {
   case 16:
     LoadMI = &*MIRBuilder
                   .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass},
                               {Adrp})
                   .addConstantPoolIndex(
                       CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
     break;
   case 8:
-    LoadMI = &*MIRBuilder
-                  .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
-                  .addConstantPoolIndex(
-                      CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    LoadMI =
+        &*MIRBuilder
+              .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+              .addConstantPoolIndex(CPIdx, 0,
+                                    AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    break;
+  case 4:
+    LoadMI =
+        &*MIRBuilder
+              .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
+              .addConstantPoolIndex(CPIdx, 0,
+                                    AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
     break;
   default:
     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
                       << *CPVal->getType());
     return nullptr;
   }
+  LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
+                                                    MachineMemOperand::MOLoad,
+                                                    Size, Align(Size)));
   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
   return LoadMI;
@@ -4316,49 +4494,15 @@ MachineInstr *AArch64InstructionSelector::emitVectorConcat(
   return &*InsElt;
 }
 
-MachineInstr *AArch64InstructionSelector::emitFMovForFConstant(
-    MachineInstr &I, MachineRegisterInfo &MRI) const {
-  assert(I.getOpcode() == TargetOpcode::G_FCONSTANT &&
-         "Expected a G_FCONSTANT!");
-  MachineOperand &ImmOp = I.getOperand(1);
-  unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
-
-  // Only handle 32 and 64 bit defs for now.
-  if (DefSize != 32 && DefSize != 64)
-    return nullptr;
-
-  // Don't handle null values using FMOV.
-  if (ImmOp.getFPImm()->isNullValue())
-    return nullptr;
-
-  // Get the immediate representation for the FMOV.
-  const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF();
-  int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF)
-                          : AArch64_AM::getFP64Imm(ImmValAPF);
-
-  // If this is -1, it means the immediate can't be represented as the requested
-  // floating point value. Bail.
-  if (Imm == -1)
-    return nullptr;
-
-  // Update MI to represent the new FMOV instruction, constrain it, and return.
-  ImmOp.ChangeToImmediate(Imm);
-  unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi;
-  I.setDesc(TII.get(MovOpc));
-  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-  return &I;
-}
-
 MachineInstr *
 AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
-                                            MachineIRBuilder &MIRBuilder) const {
+                                            MachineIRBuilder &MIRBuilder,
+                                            Register SrcReg) const {
   // CSINC increments the result when the predicate is false. Invert it.
   const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
       CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
-  auto I =
-      MIRBuilder
-          .buildInstr(AArch64::CSINCWr, {DefReg},
-                      {Register(AArch64::WZR), Register(AArch64::WZR)})
-          .addImm(InvCC);
+  auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
               .addImm(InvCC);
   constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
   return &*I;
 }
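The SrcReg parameter just added to emitCSetForICMP is what lets the earlier G_ADD fold emit a CINC: with SrcReg = WZR the emitted CSINC is a plain cset, and with SrcReg = x it yields x + 1 exactly when the original compare is true. A small emulation of CSINC's semantics, illustrative only:

```cpp
// Illustrative emulation of CSINC Wd, Wn, Wm, cc (here Wn == Wm == Src):
// Wd = cc-holds ? Wn : Wm + 1. Passing the *inverted* condition makes the
// result Src + 1 exactly when the original compare is true.
#include <cstdint>
#include <cstdio>

static uint32_t csinc(uint32_t N, uint32_t M, bool CondHolds) {
  return CondHolds ? N : M + 1;
}

int main() {
  bool CmpTrue = true; // pretend the G_ICMP evaluated to true
  uint32_t WZR = 0, X = 41;
  // cset: CSINC wzr, wzr, inv(cc) -> 1 when the compare is true.
  std::printf("cset -> %u\n", csinc(WZR, WZR, /*inverted cc*/ !CmpTrue));
  // cinc: CSINC x, x, inv(cc) -> x + 1 when the compare is true.
  std::printf("cinc -> %u\n", csinc(X, X, /*inverted cc*/ !CmpTrue));
  return 0;
}
```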
@@ -4382,8 +4526,7 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
   }
 }
 
-bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
-  MachineIRBuilder MIB(I);
+bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
   MachineRegisterInfo &MRI = *MIB.getMRI();
   // We want to recognize this pattern:
   //
@@ -4489,37 +4632,10 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
   //
   //   cmn z, y
 
-  // Helper lambda to detect the subtract followed by the compare.
-  // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0.
-  auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) {
-    if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB)
-      return false;
-
-    // Need to make sure NZCV is the same at the end of the transformation.
-    if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
-      return false;
-
-    // We want to match against SUBs.
-    if (DefMI->getOpcode() != TargetOpcode::G_SUB)
-      return false;
-
-    // Make sure that we're getting
-    // x = G_SUB 0, y
-    auto ValAndVReg =
-        getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI);
-    if (!ValAndVReg || ValAndVReg->Value != 0)
-      return false;
-
-    // This can safely be represented as a CMN.
-    return true;
-  };
-
   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
-  CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
-  const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P);
-
+  auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
   // Given this:
   //
   //   x = G_SUB 0, y
@@ -4528,7 +4644,7 @@
   // Produce this:
   //
   //   cmn y, z
-  if (IsCMN(LHSDef, CC))
+  if (isCMN(LHSDef, P, MRI))
     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
 
   // Same idea here, but with the RHS of the compare instead:
@@ -4541,7 +4657,7 @@
   //
   // Produce this:
   //
   //   cmn z, y
-  if (IsCMN(RHSDef, CC))
+  if (isCMN(RHSDef, P, MRI))
     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
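The isCMN folds above rest on modular arithmetic: with x = G_SUB 0, y, the equality x == z holds exactly when y + z wraps to zero, so cmp x, z can become cmn y, z for EQ/NE (the restriction the removed IsCMN lambda also enforced, since other predicates read flags CMN sets differently). A quick standalone check, illustrative only:

```cpp
// Quick check of the CMN rewrite above: (0 - y) == z  <=>  y + z == 0
// in wrapping 32-bit arithmetic, which is why EQ/NE survive the rewrite.
#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t Y : {0u, 1u, 5u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t Z : {0u, 1u, 0xFFFFFFFBu, 0x80000000u}) {
      bool CmpEq = (uint32_t)(0u - Y) == Z;
      bool CmnEq = (uint32_t)(Y + Z) == 0u;
      if (CmpEq != CmnEq)
        std::printf("mismatch at Y=%u Z=%u\n", Y, Z);
    }
  std::printf("cmp(0-y, z) and cmn(y, z) agree on EQ/NE\n");
  return 0;
}
```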
- IndexLoad = - emitScalarToVector(64, &AArch64::FPR128RegClass, - IndexLoad->getOperand(0).getReg(), MIRBuilder); + IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, + IndexLoad->getOperand(0).getReg(), MIB); - auto TBL1 = MIRBuilder.buildInstr( + auto TBL1 = MIB.buildInstr( AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); auto Copy = - MIRBuilder - .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) .addReg(TBL1.getReg(0), 0, AArch64::dsub); RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); I.eraseFromParent(); @@ -4640,16 +4752,10 @@ bool AArch64InstructionSelector::selectShuffleVector( // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive // Q registers for regalloc. - auto RegSeq = MIRBuilder - .buildInstr(TargetOpcode::REG_SEQUENCE, - {&AArch64::QQRegClass}, {Src1Reg}) - .addImm(AArch64::qsub0) - .addUse(Src2Reg) - .addImm(AArch64::qsub1); - - auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, - {RegSeq, IndexLoad->getOperand(0)}); - constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); + SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; + auto RegSeq = createQTuple(Regs, MIB); + auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, + {RegSeq, IndexLoad->getOperand(0)}); constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); I.eraseFromParent(); return true; @@ -4686,8 +4792,8 @@ MachineInstr *AArch64InstructionSelector::emitLaneInsert( return InsElt; } -bool AArch64InstructionSelector::selectInsertElt( - MachineInstr &I, MachineRegisterInfo &MRI) const { +bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, + MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); // Get information on the destination. @@ -4713,13 +4819,12 @@ bool AArch64InstructionSelector::selectInsertElt( // Perform the lane insert. Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); - MachineIRBuilder MIRBuilder(I); if (VecSize < 128) { // If the vector we're inserting into is smaller than 128 bits, widen it // to 128 to do the insert. - MachineInstr *ScalarToVec = emitScalarToVector( - VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder); + MachineInstr *ScalarToVec = + emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB); if (!ScalarToVec) return false; SrcReg = ScalarToVec->getOperand(0).getReg(); @@ -4729,7 +4834,7 @@ bool AArch64InstructionSelector::selectInsertElt( // Note that if our vector is already 128 bits, we end up emitting an extra // register. 
MachineInstr *InsMI = - emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder); + emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB); if (VecSize < 128) { // If we had to widen to perform the insert, then we have to demote back to @@ -4749,7 +4854,7 @@ bool AArch64InstructionSelector::selectInsertElt( << "\n"); return false; } - MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) .addReg(DemoteVec, 0, SubReg); RBI.constrainGenericRegister(DstReg, *RC, MRI); } else { @@ -4762,8 +4867,46 @@ bool AArch64InstructionSelector::selectInsertElt( return true; } +MachineInstr * +AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, + MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI) { + LLT DstTy = MRI.getType(Dst); + unsigned DstSize = DstTy.getSizeInBits(); + if (CV->isNullValue()) { + if (DstSize == 128) { + auto Mov = + MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); + constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); + return &*Mov; + } + + if (DstSize == 64) { + auto Mov = + MIRBuilder + .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) + .addImm(0); + auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) + .addReg(Mov.getReg(0), 0, AArch64::dsub); + RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); + return &*Copy; + } + } + + auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); + if (!CPLoad) { + LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); + return nullptr; + } + + auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); + RBI.constrainGenericRegister( + Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); + return &*Copy; +} + bool AArch64InstructionSelector::tryOptConstantBuildVec( - MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { + MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); unsigned DstSize = DstTy.getSizeInBits(); assert(DstSize <= 128 && "Unexpected build_vec type!"); @@ -4787,40 +4930,14 @@ bool AArch64InstructionSelector::tryOptConstantBuildVec( return false; } Constant *CV = ConstantVector::get(Csts); - MachineIRBuilder MIB(I); - if (CV->isNullValue()) { - // Until the importer can support immAllZerosV in pattern leaf nodes, - // select a zero move manually here. 
- Register DstReg = I.getOperand(0).getReg(); - if (DstSize == 128) { - auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); - } else if (DstSize == 64) { - auto Mov = - MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) - .addImm(0); - MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(Mov.getReg(0), 0, AArch64::dsub); - I.eraseFromParent(); - return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI); - } - } - auto *CPLoad = emitLoadFromConstantPool(CV, MIB); - if (!CPLoad) { - LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); + if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) return false; - } - MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0)); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), - *MRI.getRegClass(CPLoad->getOperand(0).getReg()), - MRI); I.eraseFromParent(); return true; } -bool AArch64InstructionSelector::selectBuildVector( - MachineInstr &I, MachineRegisterInfo &MRI) const { +bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, + MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); // Until we port more of the optimized selections, for now just use a vector // insert sequence. @@ -4833,12 +4950,11 @@ bool AArch64InstructionSelector::selectBuildVector( if (EltSize < 16 || EltSize > 64) return false; // Don't support all element types yet. const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - MachineIRBuilder MIRBuilder(I); const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; MachineInstr *ScalarToVec = emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, - I.getOperand(1).getReg(), MIRBuilder); + I.getOperand(1).getReg(), MIB); if (!ScalarToVec) return false; @@ -4852,7 +4968,7 @@ bool AArch64InstructionSelector::selectBuildVector( // Note that if we don't do a subregister copy, we can end up making an // extra register. PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, - MIRBuilder); + MIB); DstVec = PrevMI->getOperand(0).getReg(); } @@ -4881,8 +4997,7 @@ bool AArch64InstructionSelector::selectBuildVector( Register Reg = MRI.createVirtualRegister(RC); Register DstReg = I.getOperand(0).getReg(); - MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(DstVec, 0, SubReg); + MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); MachineOperand &RegOp = I.getOperand(1); RegOp.setReg(Reg); RBI.constrainGenericRegister(DstReg, *RC, MRI); @@ -4910,27 +5025,73 @@ static unsigned findIntrinsicID(MachineInstr &I) { } bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( - MachineInstr &I, MachineRegisterInfo &MRI) const { + MachineInstr &I, MachineRegisterInfo &MRI) { // Find the intrinsic ID. unsigned IntrinID = findIntrinsicID(I); if (!IntrinID) return false; - MachineIRBuilder MIRBuilder(I); // Select the instruction. switch (IntrinID) { default: return false; + case Intrinsic::aarch64_ldxp: + case Intrinsic::aarch64_ldaxp: { + auto NewI = MIB.buildInstr( + IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, + {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, + {I.getOperand(3)}); + NewI.cloneMemRefs(I); + constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); + break; + } case Intrinsic::trap: - MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1); + MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); break; case Intrinsic::debugtrap: - MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); + MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; case Intrinsic::ubsantrap: - MIRBuilder.buildInstr(AArch64::BRK, {}, {}) + MIB.buildInstr(AArch64::BRK, {}, {}) .addImm(I.getOperand(1).getImm() | ('U' << 8)); break; + case Intrinsic::aarch64_neon_st2: { + Register Src1 = I.getOperand(1).getReg(); + Register Src2 = I.getOperand(2).getReg(); + Register Ptr = I.getOperand(3).getReg(); + LLT Ty = MRI.getType(Src1); + const LLT S8 = LLT::scalar(8); + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT P0 = LLT::pointer(0, 64); + unsigned Opc; + if (Ty == LLT::fixed_vector(8, S8)) + Opc = AArch64::ST2Twov8b; + else if (Ty == LLT::fixed_vector(16, S8)) + Opc = AArch64::ST2Twov16b; + else if (Ty == LLT::fixed_vector(4, S16)) + Opc = AArch64::ST2Twov4h; + else if (Ty == LLT::fixed_vector(8, S16)) + Opc = AArch64::ST2Twov8h; + else if (Ty == LLT::fixed_vector(2, S32)) + Opc = AArch64::ST2Twov2s; + else if (Ty == LLT::fixed_vector(4, S32)) + Opc = AArch64::ST2Twov4s; + else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) + Opc = AArch64::ST2Twov2d; + else if (Ty == S64 || Ty == P0) + Opc = AArch64::ST1Twov1d; + else + llvm_unreachable("Unexpected type for st2!"); + SmallVector<Register, 2> Regs = {Src1, Src2}; + Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB) + : createDTuple(Regs, MIB); + auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); + Store.cloneMemRefs(I); + constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); + break; + } } I.eraseFromParent(); @@ -4942,7 +5103,6 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, unsigned IntrinID = findIntrinsicID(I); if (!IntrinID) return false; - MachineIRBuilder MIRBuilder(I); switch (IntrinID) { default: @@ -4960,7 +5120,7 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, // the source and destination if they are on GPRs. if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)}); + MIB.buildCopy({SrcReg}, {I.getOperand(2)}); // Make sure the copy ends up getting constrained properly. RBI.constrainGenericRegister(I.getOperand(2).getReg(), @@ -4971,14 +5131,14 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); // Actually insert the instruction. - auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); + auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); // Did we create a new register for the destination? if (DstReg != I.getOperand(0).getReg()) { // Yep. Copy the result of the instruction back into the original // destination. 
- MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg}); + MIB.buildCopy({I.getOperand(0)}, {DstReg}); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR32RegClass, MRI); } @@ -5005,11 +5165,11 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, } if (STI.hasPAuth()) { - MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); + MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); } else { - MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); - MIRBuilder.buildInstr(AArch64::XPACLRI); - MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); + MIB.buildInstr(AArch64::XPACLRI); + MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); } I.eraseFromParent(); @@ -5021,31 +5181,42 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, while (Depth--) { Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); auto Ldr = - MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}) - .addImm(0); + MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); FrameAddr = NextFrame; } if (IntrinID == Intrinsic::frameaddress) - MIRBuilder.buildCopy({DstReg}, {FrameAddr}); + MIB.buildCopy({DstReg}, {FrameAddr}); else { MFI.setReturnAddressIsTaken(true); if (STI.hasPAuth()) { Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); - MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); + MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); + MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); } else { - MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}).addImm(1); - MIRBuilder.buildInstr(AArch64::XPACLRI); - MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) + .addImm(1); + MIB.buildInstr(AArch64::XPACLRI); + MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); } } I.eraseFromParent(); return true; } + case Intrinsic::swift_async_context_addr: + auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, + {Register(AArch64::FP)}) + .addImm(8) + .addImm(0); + constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); + + MF->getFrameInfo().setFrameAddressIsTaken(true); + MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); + I.eraseFromParent(); + return true; } return false; } @@ -5168,7 +5339,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( // Always fold if there is one use, or if we're optimizing for size. Register DefReg = MI.getOperand(0).getReg(); if (MRI.hasOneNonDBGUse(DefReg) || - MI.getParent()->getParent()->getFunction().hasMinSize()) + MI.getParent()->getParent()->getFunction().hasOptSize()) return true; // It's better to avoid folding and recomputing shifts when we don't have a @@ -5577,8 +5748,10 @@ AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, return None; // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. - // TODO: Need to check GV's offset % size if doing offset folding into globals. 
- assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global");
+ auto Offset = Adrp.getOperand(1).getOffset();
+ if (Offset % Size != 0)
+ return None;
+
 auto GV = Adrp.getOperand(1).getGlobal();
 if (GV->isThreadLocal())
 return None;
@@ -5592,7 +5765,7 @@ AArch64InstructionSelector::tryFoldAddLowIntoImm(
 Register AdrpReg = Adrp.getOperand(0).getReg();
 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
 [=](MachineInstrBuilder &MIB) {
- MIB.addGlobalAddress(GV, /* Offset */ 0,
+ MIB.addGlobalAddress(GV, Offset,
 OpFlags | AArch64II::MO_PAGEOFF |
 AArch64II::MO_NC);
 }}};
@@ -5736,9 +5909,9 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
 assert(Size != 64 && "Extend from 64 bits?");
 switch (Size) {
 case 8:
- return AArch64_AM::SXTB;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
 case 16:
- return AArch64_AM::SXTH;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
 case 32:
 return AArch64_AM::SXTW;
 default:
@@ -5751,9 +5924,9 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
 assert(Size != 64 && "Extend from 64 bits?");
 switch (Size) {
 case 8:
- return AArch64_AM::UXTB;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
 case 16:
- return AArch64_AM::UXTH;
+ return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
 case 32:
 return AArch64_AM::UXTW;
 default:
@@ -5895,6 +6068,33 @@ void AArch64InstructionSelector::renderLogicalImm64(
 MIB.addImm(Enc);
 }
+void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
+void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
+void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(
+ AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
+}
+
 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
 const MachineInstr &MI, unsigned NumBytes) const {
 if (!MI.mayLoadOrStore())
@@ -5946,7 +6146,14 @@ static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
 // Insert a cross-bank copy.
 auto *OpDef = MRI.getVRegDef(OpReg);
 const LLT &Ty = MRI.getType(OpReg);
- MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
+ MachineBasicBlock &OpDefBB = *OpDef->getParent();
+
+ // Any instruction we insert must appear after all PHIs in the block
+ // for the block to be valid MIR.
+ MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
+ if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
+ InsertPt = OpDefBB.getFirstNonPHI();
+ MIB.setInsertPt(*OpDef->getParent(), InsertPt);
 auto Copy = MIB.buildCopy(Ty, OpReg);
 MRI.setRegBank(Copy.getReg(0), *DstRB);
 MO.setReg(Copy.getReg(0));
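
Note on the emitCSetForICMP hunk above: the helper now takes an explicit SrcReg instead of hard-wiring WZR into both CSINC sources, so the same pattern can increment a value other than zero through the false arm. A minimal standalone simulation of the underlying CSINC semantics, for reference only (the csinc function below is illustrative, not an LLVM API):

#include <cassert>
#include <cstdint>

// CSINC Rd, Rn, Rm, cond  =>  Rd = cond ? Rn : Rm + 1
static uint32_t csinc(uint32_t rn, uint32_t rm, bool cond) {
  return cond ? rn : rm + 1;
}

int main() {
  // With both sources tied to WZR (zero) and the predicate inverted,
  // CSINC yields 1 exactly when the original predicate holds -- the
  // CSET idiom the selector's comment describes: "CSINC increments
  // the result when the predicate is false. Invert it."
  for (bool pred : {false, true})
    assert(csinc(0, 0, /*cond=*/!pred) == (pred ? 1u : 0u));
  return 0;
}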

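Note on the tryFoldIntegerCompare hunk: the deleted IsCMN lambda (replaced by the shared isCMN helper that comes with the new AArch64GlobalISelUtils include) keeps the EQ/NE restriction. The reason is that cmp x, (0 - y) and cmn x, y compute the same 32-bit result modulo 2^32, so the Z flag always agrees, while carry and overflow may not; EQ/NE read only Z, so only they survive the rewrite. A small self-contained check of that Z-flag argument (function names are illustrative only):

#include <cassert>
#include <cstdint>

// Z flag of CMP x, (0 - y): set iff x - (0 - y) == 0 (mod 2^32).
static bool zAfterCmp(uint32_t x, uint32_t y) { return (x - (0u - y)) == 0; }

// Z flag of CMN x, y: set iff x + y == 0 (mod 2^32).
static bool zAfterCmn(uint32_t x, uint32_t y) { return (x + y) == 0; }

int main() {
  // Unsigned wraparound makes x - (0 - y) equal x + y for every pair,
  // so EQ/NE (which consult only Z) are safe across the CMN rewrite;
  // signed predicates also read N, C, or V, which can differ.
  for (uint32_t x : {0u, 1u, 5u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t y : {0u, 1u, 0xFFFFFFFBu, 0x80000000u})
      assert(zAfterCmp(x, y) == zAfterCmn(x, y));
  return 0;
}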