author    | Dimitry Andric <dim@FreeBSD.org> | 2020-01-24 22:00:03 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-01-24 22:00:03 +0000
commit    | 480093f4440d54b30b3025afeac24b48f2ba7a2e (patch)
tree      | 162e72994062888647caf0d875428db9445491a8 /contrib/llvm-project/llvm/lib/Target/ARM
parent    | 489b1cf2ecf5b9b4a394857987014bfb09067726 (diff)
parent    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/ARM')
58 files changed, 5581 insertions, 3133 deletions
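Among the changes in the diff below, ARMBaseInstrInfo moves its copy-recognition hook from the old bool-plus-out-parameter form to Optional<DestSourcePair>, and gains an isAddImmediate hook returning Optional<RegImmPair>. The following is a minimal editorial sketch, not part of the merged commit, of how a target-independent caller consumes these Optional-based hooks; it assumes the LLVM 10-era TargetInstrInfo declarations (the public isCopyInstr wrapper, DestSourcePair with Destination/Source members, and RegImmPair with Reg/Imm members).

// Illustrative sketch only; names outside the diff (isCopyInstr, Destination,
// Source, Reg, Imm) are assumed from TargetInstrInfo.h of this LLVM era.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

static void describeInstr(const TargetInstrInfo &TII, const MachineInstr &MI) {
  // Copy recognition: instead of a bool return plus two out-parameters,
  // callers now test and dereference an Optional<DestSourcePair>.
  if (auto DestSrc = TII.isCopyInstr(MI))
    dbgs() << "copy: " << *DestSrc->Source << " -> " << *DestSrc->Destination
           << "\n";

  // isAddImmediate describes a simple "reg = reg + imm" (ADDri/SUBri in the
  // ARM override below), returning None for anything it cannot express as a
  // register/offset pair.
  if (MI.getNumOperands() && MI.getOperand(0).isReg())
    if (auto RegImm = TII.isAddImmediate(MI, MI.getOperand(0).getReg()))
      dbgs() << "add-immediate of " << RegImm->Imm << "\n";
}

Returning Optional values keeps the "not a copy / not an add-immediate" case explicit and avoids partially written out-parameters, which is the pattern visible in the isCopyInstrImpl rewrite in the diff below.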
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp index 30b9c8071ba2..f8a86a70c077 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -157,9 +157,8 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { MachineInstr *MI = MRI->getVRegDef(SReg); if (!MI) return ARM::ssub_0; MachineOperand *MO = MI->findRegisterDefOperand(SReg); - - assert(MO->isReg() && "Non-register operand found!"); if (!MO) return ARM::ssub_0; + assert(MO->isReg() && "Non-register operand found!"); if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h index 2e6f756d522c..3412813a3ef2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h @@ -43,7 +43,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); -FunctionPass *createARMCodeGenPreparePass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -54,6 +53,7 @@ FunctionPass *createThumb2SizeReductionPass( InstructionSelector * createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); +Pass *createMVEGatherScatterLoweringPass(); void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); @@ -61,7 +61,6 @@ void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); -void initializeARMCodeGenPreparePass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); @@ -69,6 +68,7 @@ void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); void initializeMVETailPredicationPass(PassRegistry &); +void initializeMVEGatherScatterLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td index fed4cb2b9316..380eaa863689 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td @@ -303,6 +303,10 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", "Disable VFP / NEON MAC instructions">; +// VFPv4 added VFMA instructions that can similar be fast or slow. +def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true", + "Disable VFP / NEON FMA instructions">; + // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. 
def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", "HasVMLxForwarding", "true", @@ -415,10 +419,6 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", "DisablePostRAScheduler", "true", "Don't schedule again after register allocation">; -// Enable use of alias analysis during code generation -def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", - "Use alias analysis during codegen">; - // Armv8.5-A extensions def FeatureSB : SubtargetFeature<"sb", "HasSB", "true", @@ -584,7 +584,6 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos", "Samsung Exynos processors", [FeatureZCZeroing, FeatureUseWideStrideVFP, - FeatureUseAA, FeatureSplatVFPToNeon, FeatureSlowVGETLNi32, FeatureSlowVDUP32, @@ -593,6 +592,7 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos", FeatureHWDivThumb, FeatureHWDivARM, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureHasRetAddrStack, FeatureFuseLiterals, FeatureFuseAES, @@ -923,6 +923,7 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureTrustZone, FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureVMLxForwarding, FeatureMP, FeatureVFP4]>; @@ -933,6 +934,7 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, FeatureSlowFPBrcc, FeatureHasVMLxHazards, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureVMLxForwarding, FeatureMP, FeatureVFP4, @@ -945,6 +947,7 @@ def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, FeatureSlowFPBrcc, FeatureHasVMLxHazards, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureVMLxForwarding]>; def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, @@ -1014,6 +1017,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureHasVMLxHazards, FeatureProfUnpredicate, FeaturePrefISHSTBarrier, @@ -1032,6 +1036,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureVFP3_D16, FeatureAvoidPartialCPSR]>; @@ -1041,6 +1046,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureSlowFPBrcc, FeatureHWDivARM, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureAvoidPartialCPSR]>; def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, @@ -1051,6 +1057,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureSlowFPBrcc, FeatureHWDivARM, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureAvoidPartialCPSR]>; def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, @@ -1061,27 +1068,26 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureSlowFPBrcc, FeatureHWDivARM, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureAvoidPartialCPSR]>; def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m, ProcM3, FeaturePrefLoopAlign32, FeatureUseMISched, - FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcessorModel<"sc300", CortexM4Model, [ARMv7m, ProcM3, FeatureUseMISched, - FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, FeatureVFP4_D16_SP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureUseMISched, - FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, @@ -1095,8 +1101,8 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureFPARMv8_D16_SP, FeaturePrefLoopAlign32, 
FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureUseMISched, - FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline, @@ -1104,8 +1110,8 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline, FeatureFPARMv8_D16_SP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureUseMISched, - FeatureUseAA, FeatureHasNoBranchPredictor]>; @@ -1192,13 +1198,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, FeatureHasSlowFPVMLx, + FeatureHasSlowFPVFMx, FeatureCrypto, FeatureUseMISched, FeatureZCZeroing, FeatureNoPostRASched]>; -def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynos]>; -def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynos]>; def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>; def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos, FeatureFullFP16, @@ -1215,8 +1220,7 @@ def : ProcNoItin<"kryo", [ARMv8a, ProcKryo, def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, FeatureUseMISched, - FeatureFPAO, - FeatureUseAA]>; + FeatureFPAO]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index c8c91e53c44e..6f26ca127f94 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -54,8 +54,8 @@ using namespace llvm; ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - InConstantPool(false), OptimizationGoals(-1) {} + : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr), + MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -1170,10 +1170,16 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { break; case ARM::ADDri: case ARM::t2ADDri: + case ARM::t2ADDri12: + case ARM::t2ADDspImm: + case ARM::t2ADDspImm12: Offset = -MI->getOperand(2).getImm(); break; case ARM::SUBri: case ARM::t2SUBri: + case ARM::t2SUBri12: + case ARM::t2SUBspImm: + case ARM::t2SUBspImm12: Offset = MI->getOperand(2).getImm(); break; case ARM::tSUBspi: @@ -2142,7 +2148,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { //===----------------------------------------------------------------------===// // Force static initialization. 
-extern "C" void LLVMInitializeARMAsmPrinter() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmPrinter() { RegisterAsmPrinter<ARMAsmPrinter> X(getTheARMLETarget()); RegisterAsmPrinter<ARMAsmPrinter> Y(getTheARMBETarget()); RegisterAsmPrinter<ARMAsmPrinter> A(getTheThumbLETarget()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 684cd1def977..48f781510254 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -134,7 +134,7 @@ ScheduleHazardRecognizer *ARMBaseInstrInfo:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const { if (Subtarget.isThumb2() || Subtarget.hasVFP2Base()) - return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG); + return new ARMHazardRecognizer(II, DAG); return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); } @@ -829,8 +829,8 @@ void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB, void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc) const { + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { bool GPRDest = ARM::GPRRegClass.contains(DestReg); bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); @@ -993,9 +993,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Mov->addRegisterKilled(SrcReg, TRI); } -bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI, - const MachineOperand *&Src, - const MachineOperand *&Dest) const { +Optional<DestSourcePair> +ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { // VMOVRRD is also a copy instruction but it requires // special way of handling. It is more complex copy version // and since that we are not considering it. For recognition @@ -1006,10 +1005,8 @@ bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI, if (!MI.isMoveReg() || (MI.getOpcode() == ARM::VORRq && MI.getOperand(1).getReg() != MI.getOperand(2).getReg())) - return false; - Dest = &MI.getOperand(0); - Src = &MI.getOperand(1); - return true; + return None; + return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; } const MachineInstrBuilder & @@ -2726,25 +2723,6 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, return false; } -/// getSwappedCondition - assume the flags are set by MI(a,b), return -/// the condition code if we modify the instructions such that flags are -/// set by MI(b,a). -inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { - switch (CC) { - default: return ARMCC::AL; - case ARMCC::EQ: return ARMCC::EQ; - case ARMCC::NE: return ARMCC::NE; - case ARMCC::HS: return ARMCC::LS; - case ARMCC::LO: return ARMCC::HI; - case ARMCC::HI: return ARMCC::LO; - case ARMCC::LS: return ARMCC::HS; - case ARMCC::GE: return ARMCC::LE; - case ARMCC::LT: return ARMCC::GT; - case ARMCC::GT: return ARMCC::LT; - case ARMCC::LE: return ARMCC::GE; - } -} - /// getCmpToAddCondition - assume the flags are set by CMP(a,b), return /// the condition code if we modify the instructions such that flags are /// set by ADD(a,b,X). 
@@ -3279,22 +3257,26 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } break; case ARM::t2ADDrr: - case ARM::t2SUBrr: + case ARM::t2SUBrr: { if (UseOpc == ARM::t2SUBrr && Commute) return false; // ADD/SUB are special because they're essentially the same operation, so // we can handle a larger range of immediates. + const bool ToSP = DefMI.getOperand(0).getReg() == ARM::SP; + const unsigned t2ADD = ToSP ? ARM::t2ADDspImm : ARM::t2ADDri; + const unsigned t2SUB = ToSP ? ARM::t2SUBspImm : ARM::t2SUBri; if (ARM_AM::isT2SOImmTwoPartVal(ImmVal)) - NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri; + NewUseOpc = UseOpc == ARM::t2ADDrr ? t2ADD : t2SUB; else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) { ImmVal = -ImmVal; - NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri; + NewUseOpc = UseOpc == ARM::t2ADDrr ? t2SUB : t2ADD; } else return false; SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal); SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); break; + } case ARM::t2ORRrr: case ARM::t2EORrr: if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal)) @@ -3314,7 +3296,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned OpIdx = Commute ? 2 : 1; Register Reg1 = UseMI.getOperand(OpIdx).getReg(); bool isKill = UseMI.getOperand(OpIdx).isKill(); - Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + const TargetRegisterClass *TRC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(TRC); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc), NewReg) .addReg(Reg1, getKillRegState(isKill)) @@ -3326,6 +3309,18 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.getOperand(1).setIsKill(); UseMI.getOperand(2).ChangeToImmediate(SOImmValV2); DefMI.eraseFromParent(); + // FIXME: t2ADDrr should be split, as different rulles apply when writing to SP. + // Just as t2ADDri, that was split to [t2ADDri, t2ADDspImm]. + // Then the below code will not be needed, as the input/output register + // classes will be rgpr or gprSP. + // For now, we fix the UseMI operand explicitly here: + switch(NewUseOpc){ + case ARM::t2ADDspImm: + case ARM::t2SUBspImm: + case ARM::t2ADDri: + case ARM::t2SUBri: + MRI->setRegClass(UseMI.getOperand(0).getReg(), TRC); + } return true; } @@ -5350,6 +5345,34 @@ ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI, + Register Reg) const { + int Sign = 1; + unsigned Opcode = MI.getOpcode(); + int64_t Offset = 0; + + // TODO: Handle cases where Reg is a super- or sub-register of the + // destination register. + if (Reg != MI.getOperand(0).getReg()) + return None; + + // We describe SUBri or ADDri instructions. + if (Opcode == ARM::SUBri) + Sign = -1; + else if (Opcode != ARM::ADDri) + return None; + + // TODO: Third operand can be global address (usually some string). Since + // strings can be relocated we cannot calculate their offsets for + // now. 
+ if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || + !MI.getOperand(2).isImm()) + return None; + + Offset = MI.getOperand(2).getImm() * Sign; + return RegImmPair{MI.getOperand(1).getReg(), Offset}; +} + bool llvm::registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index c232b6f0b45d..f6d4ebe3a090 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -99,12 +99,11 @@ protected: MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; - - /// If the specific machine instruction is a instruction that moves/copies - /// value from one register to another register return true along with - /// @Source machine operand and @Destination machine operand. - bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source, - const MachineOperand *&Destination) const override; + /// If the specific machine instruction is an instruction that moves/copies + /// value from one register to another register return destination and source + /// registers as machine operands. + Optional<DestSourcePair> + isCopyInstrImpl(const MachineInstr &MI) const override; public: // Return whether the target has an explicit NOP encoding. @@ -203,7 +202,7 @@ public: const ARMSubtarget &Subtarget) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -455,6 +454,9 @@ public: // 3 - predicate reg return MI.getOperand(3).getReg(); } + + Optional<RegImmPair> isAddImmediate(const MachineInstr &MI, + Register Reg) const override; }; /// Get the operands corresponding to the given \p Pred value. By default, the @@ -486,6 +488,27 @@ bool isUncondBranchOpcode(int Opc) { return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; } +// This table shows the VPT instruction variants, i.e. the different +// mask field encodings, see also B5.6. Predication/conditional execution in +// the ArmARM. 
+enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 +}; + static inline bool isVPTOpcode(int Opc) { return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 || @@ -502,6 +525,97 @@ static inline bool isVPTOpcode(int Opc) { } static inline +unsigned VCMPOpcodeToVPT(unsigned Opcode) { + switch (Opcode) { + default: + return 0; + case ARM::MVE_VCMPf32: + return ARM::MVE_VPTv4f32; + case ARM::MVE_VCMPf16: + return ARM::MVE_VPTv8f16; + case ARM::MVE_VCMPi8: + return ARM::MVE_VPTv16i8; + case ARM::MVE_VCMPi16: + return ARM::MVE_VPTv8i16; + case ARM::MVE_VCMPi32: + return ARM::MVE_VPTv4i32; + case ARM::MVE_VCMPu8: + return ARM::MVE_VPTv16u8; + case ARM::MVE_VCMPu16: + return ARM::MVE_VPTv8u16; + case ARM::MVE_VCMPu32: + return ARM::MVE_VPTv4u32; + case ARM::MVE_VCMPs8: + return ARM::MVE_VPTv16s8; + case ARM::MVE_VCMPs16: + return ARM::MVE_VPTv8s16; + case ARM::MVE_VCMPs32: + return ARM::MVE_VPTv4s32; + + case ARM::MVE_VCMPf32r: + return ARM::MVE_VPTv4f32r; + case ARM::MVE_VCMPf16r: + return ARM::MVE_VPTv8f16r; + case ARM::MVE_VCMPi8r: + return ARM::MVE_VPTv16i8r; + case ARM::MVE_VCMPi16r: + return ARM::MVE_VPTv8i16r; + case ARM::MVE_VCMPi32r: + return ARM::MVE_VPTv4i32r; + case ARM::MVE_VCMPu8r: + return ARM::MVE_VPTv16u8r; + case ARM::MVE_VCMPu16r: + return ARM::MVE_VPTv8u16r; + case ARM::MVE_VCMPu32r: + return ARM::MVE_VPTv4u32r; + case ARM::MVE_VCMPs8r: + return ARM::MVE_VPTv16s8r; + case ARM::MVE_VCMPs16r: + return ARM::MVE_VPTv8s16r; + case ARM::MVE_VCMPs32r: + return ARM::MVE_VPTv4s32r; + } +} + +static inline +unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + break; + case ARM::MVE_VCTP8: + return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; + case ARM::MVE_VCTP16: + return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; + case ARM::MVE_VCTP32: + return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; + case ARM::MVE_VCTP64: + return IsDoLoop ? 
ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; + } + return 0; +} + +static inline +bool isVCTP(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + break; + case ARM::MVE_VCTP8: + case ARM::MVE_VCTP16: + case ARM::MVE_VCTP32: + case ARM::MVE_VCTP64: + return true; + } + return false; +} + +static inline +bool isLoopStart(MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; +} + +static inline bool isCondBranchOpcode(int Opc) { return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 1eaf871867e0..52e6d05c3155 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -75,6 +75,8 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around return CSR_NoRegs_SaveList; + } else if (F.getCallingConv() == CallingConv::CFGuard_Check) { + return CSR_Win_AAPCS_CFGuard_Check_SaveList; } else if (F.hasFnAttribute("interrupt")) { if (STI.isMClass()) { // M-class CPUs have hardware which saves the registers needed to allow a @@ -123,7 +125,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; - + if (CC == CallingConv::CFGuard_Check) + return CSR_Win_AAPCS_CFGuard_Check_RegMask; if (STI.getTargetLowering()->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask @@ -191,7 +194,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); markSuperRegs(Reserved, ARM::APSR_NZCV); - if (TFI->hasFP(MF) || STI.isTargetDarwin()) + if (TFI->hasFP(MF)) markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); @@ -385,7 +388,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { return true; // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited - // negative range for ldr/str (255), and thumb1 is positive offsets only. + // negative range for ldr/str (255), and Thumb1 is positive offsets only. // // It's going to be better to use the SP or Base Pointer instead. 
When there // are variable sized objects, we can't reference off of the SP, so we diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp index d3b595ce8323..ce260a9ba145 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp @@ -106,7 +106,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MIRBuilder.buildConstant(OffsetReg, Offset); Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); + MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); return AddrReg; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp index 92ebc542b423..a47c59512592 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -18,8 +18,8 @@ using namespace llvm; // APCS f64 is in register pairs, possibly split to stack -static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, +static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, CCState &State, bool CanFail) { static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -48,9 +48,9 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, +static bool CC_ARM_APCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) return false; @@ -61,8 +61,8 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } // AAPCS f64 is in aligned register pairs -static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, +static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, CCState &State, bool CanFail) { static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; @@ -102,9 +102,9 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, +static bool CC_ARM_AAPCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) return false; @@ -114,8 +114,8 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; // we handled it } -static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, CCState &State) { +static bool f64RetAssign(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, CCState &State) { static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; @@ -134,9 +134,9 @@ static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, 
MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, +static bool RetCC_ARM_APCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) return false; @@ -145,9 +145,9 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; // we handled it } -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State); @@ -169,10 +169,10 @@ static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // InConsecutiveRegsLast set. We must process all members of the HA before // we can allocate it, as we need to know the total number of registers that // will be needed in order to (attempt to) allocate a contiguous block. -static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, +static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, + MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); @@ -181,7 +181,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, assert(PendingMembers[0].getLocVT() == LocVT); // Add the argument to the list to be allocated once we know the size of the - // aggregate. Store the type's required alignmnent as extra info for later: in + // aggregate. Store the type's required alignment as extra info for later: in // the [N x i64] case all trace has been removed by the time we actually get // to do allocation. PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h index 615634551d90..7c692f03b440 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h @@ -32,6 +32,9 @@ bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td index 61d2d83ddc40..5df5b56f5afa 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td @@ -20,7 +20,7 @@ def CC_ARM_APCS : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<4, 4>>, - + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // Pass SwiftSelf in a callee saved register. 
@@ -80,7 +80,7 @@ def FastCC_ARM_APCS : CallingConv<[ S9, S10, S11, S12, S13, S14, S15]>>, // CPRCs may be allocated to co-processor registers or the stack - they - // may never be allocated to core registers. + // may never be allocated to core registers. CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>, @@ -165,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[ CCIfNest<CCAssignToReg<[R12]>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -182,8 +182,8 @@ def CC_ARM_AAPCS : CallingConv<[ let Entry = 1 in def RetCC_ARM_AAPCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16,v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -208,8 +208,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCIfByVal<CCPassByVal<4, 4>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -230,8 +230,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ let Entry = 1 in def RetCC_ARM_AAPCS_VFP : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -246,6 +246,16 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[ CCDelegateTo<RetCC_ARM_AAPCS_Common> ]>; + +// Windows Control Flow Guard checks take a single argument (the target function +// address) and have no return value. +let Entry = 1 in +def CC_ARM_Win32_CFGuard_Check : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R0]>> +]>; + + + //===----------------------------------------------------------------------===// // Callee-saved register lists. 
//===----------------------------------------------------------------------===// @@ -256,6 +266,11 @@ def CSR_FPRegs : CalleeSavedRegs<(add (sequence "D%u", 0, 31))>; def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4, (sequence "D%u", 15, 8))>; +// The Windows Control Flow Guard Check function preserves the same registers as +// AAPCS, and also preserves all floating point registers. +def CSR_Win_AAPCS_CFGuard_Check : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, + R6, R5, R4, (sequence "D%u", 15, 0))>; + // R8 is used to pass swifterror, remove it from CSR. def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp deleted file mode 100644 index 1c2c8aef55bb..000000000000 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ /dev/null @@ -1,1069 +0,0 @@ -//===----- ARMCodeGenPrepare.cpp ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass inserts intrinsics to handle small types that would otherwise be -/// promoted during legalization. Here we can manually promote types or insert -/// intrinsics which can handle narrow types that aren't supported by the -/// register classes. -// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "ARMSubtarget.h" -#include "ARMTargetMachine.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" - -#define DEBUG_TYPE "arm-codegenprepare" - -using namespace llvm; - -static cl::opt<bool> -DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true), - cl::desc("Disable ARM specific CodeGenPrepare pass")); - -static cl::opt<bool> -EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false), - cl::desc("Use DSP instructions for scalar operations")); - -static cl::opt<bool> -EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), - cl::desc("Use DSP instructions for scalar operations\ - with immediate operands")); - -// The goal of this pass is to enable more efficient code generation for -// operations on narrow types (i.e. types with < 32-bits) and this is a -// motivating IR code example: -// -// define hidden i32 @cmp(i8 zeroext) { -// %2 = add i8 %0, -49 -// %3 = icmp ult i8 %2, 3 -// .. -// } -// -// The issue here is that i8 is type-legalized to i32 because i8 is not a -// legal type. 
Thus, arithmetic is done in integer-precision, but then the -// byte value is masked out as follows: -// -// t19: i32 = add t4, Constant:i32<-49> -// t24: i32 = and t19, Constant:i32<255> -// -// Consequently, we generate code like this: -// -// subs r0, #49 -// uxtb r1, r0 -// cmp r1, #3 -// -// This shows that masking out the byte value results in generation of -// the UXTB instruction. This is not optimal as r0 already contains the byte -// value we need, and so instead we can just generate: -// -// sub.w r1, r0, #49 -// cmp r1, #3 -// -// We achieve this by type promoting the IR to i32 like so for this example: -// -// define i32 @cmp(i8 zeroext %c) { -// %0 = zext i8 %c to i32 -// %c.off = add i32 %0, -49 -// %1 = icmp ult i32 %c.off, 3 -// .. -// } -// -// For this to be valid and legal, we need to prove that the i32 add is -// producing the same value as the i8 addition, and that e.g. no overflow -// happens. -// -// A brief sketch of the algorithm and some terminology. -// We pattern match interesting IR patterns: -// - which have "sources": instructions producing narrow values (i8, i16), and -// - they have "sinks": instructions consuming these narrow values. -// -// We collect all instruction connecting sources and sinks in a worklist, so -// that we can mutate these instruction and perform type promotion when it is -// legal to do so. - -namespace { -class IRPromoter { - SmallPtrSet<Value*, 8> NewInsts; - SmallPtrSet<Instruction*, 4> InstsToRemove; - DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap; - SmallPtrSet<Value*, 8> Promoted; - Module *M = nullptr; - LLVMContext &Ctx; - // The type we promote to: always i32 - IntegerType *ExtTy = nullptr; - // The type of the value that the search began from, either i8 or i16. - // This defines the max range of the values that we allow in the promoted - // tree. 
- IntegerType *OrigTy = nullptr; - SetVector<Value*> *Visited; - SmallPtrSetImpl<Value*> *Sources; - SmallPtrSetImpl<Instruction*> *Sinks; - SmallPtrSetImpl<Instruction*> *SafeToPromote; - SmallPtrSetImpl<Instruction*> *SafeWrap; - - void ReplaceAllUsersOfWith(Value *From, Value *To); - void PrepareWrappingAdds(void); - void ExtendSources(void); - void ConvertTruncs(void); - void PromoteTree(void); - void TruncateSinks(void); - void Cleanup(void); - -public: - IRPromoter(Module *M) : M(M), Ctx(M->getContext()), - ExtTy(Type::getInt32Ty(Ctx)) { } - - - void Mutate(Type *OrigTy, - SetVector<Value*> &Visited, - SmallPtrSetImpl<Value*> &Sources, - SmallPtrSetImpl<Instruction*> &Sinks, - SmallPtrSetImpl<Instruction*> &SafeToPromote, - SmallPtrSetImpl<Instruction*> &SafeWrap); -}; - -class ARMCodeGenPrepare : public FunctionPass { - const ARMSubtarget *ST = nullptr; - IRPromoter *Promoter = nullptr; - std::set<Value*> AllVisited; - SmallPtrSet<Instruction*, 8> SafeToPromote; - SmallPtrSet<Instruction*, 4> SafeWrap; - - bool isSafeWrap(Instruction *I); - bool isSupportedValue(Value *V); - bool isLegalToPromote(Value *V); - bool TryToPromote(Value *V); - -public: - static char ID; - static unsigned TypeSize; - Type *OrigTy = nullptr; - - ARMCodeGenPrepare() : FunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetPassConfig>(); - } - - StringRef getPassName() const override { return "ARM IR optimizations"; } - - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - bool doFinalization(Module &M) override; -}; - -} - -static bool GenerateSignBits(Value *V) { - if (!isa<Instruction>(V)) - return false; - - unsigned Opc = cast<Instruction>(V)->getOpcode(); - return Opc == Instruction::AShr || Opc == Instruction::SDiv || - Opc == Instruction::SRem || Opc == Instruction::SExt; -} - -static bool EqualTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; -} - -static bool LessOrEqualTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize; -} - -static bool GreaterThanTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize; -} - -static bool LessThanTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize; -} - -/// Some instructions can use 8- and 16-bit operands, and we don't need to -/// promote anything larger. We disallow booleans to make life easier when -/// dealing with icmps but allow any other integer that is <= 16 bits. Void -/// types are accepted so we can handle switches. -static bool isSupportedType(Value *V) { - Type *Ty = V->getType(); - - // Allow voids and pointers, these won't be promoted. - if (Ty->isVoidTy() || Ty->isPointerTy()) - return true; - - if (auto *Ld = dyn_cast<LoadInst>(V)) - Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType(); - - if (!isa<IntegerType>(Ty) || - cast<IntegerType>(V->getType())->getBitWidth() == 1) - return false; - - return LessOrEqualTypeSize(V); -} - -/// Return true if the given value is a source in the use-def chain, producing -/// a narrow 'TypeSize' value. These values will be zext to start the promotion -/// of the tree to i32. We guarantee that these won't populate the upper bits -/// of the register. ZExt on the loads will be free, and the same for call -/// return values because we only accept ones that guarantee a zeroext ret val. 
-/// Many arguments will have the zeroext attribute too, so those would be free -/// too. -static bool isSource(Value *V) { - if (!isa<IntegerType>(V->getType())) - return false; - - // TODO Allow zext to be sources. - if (isa<Argument>(V)) - return true; - else if (isa<LoadInst>(V)) - return true; - else if (isa<BitCastInst>(V)) - return true; - else if (auto *Call = dyn_cast<CallInst>(V)) - return Call->hasRetAttr(Attribute::AttrKind::ZExt); - else if (auto *Trunc = dyn_cast<TruncInst>(V)) - return EqualTypeSize(Trunc); - return false; -} - -/// Return true if V will require any promoted values to be truncated for the -/// the IR to remain valid. We can't mutate the value type of these -/// instructions. -static bool isSink(Value *V) { - // TODO The truncate also isn't actually necessary because we would already - // proved that the data value is kept within the range of the original data - // type. - - // Sinks are: - // - points where the value in the register is being observed, such as an - // icmp, switch or store. - // - points where value types have to match, such as calls and returns. - // - zext are included to ease the transformation and are generally removed - // later on. - if (auto *Store = dyn_cast<StoreInst>(V)) - return LessOrEqualTypeSize(Store->getValueOperand()); - if (auto *Return = dyn_cast<ReturnInst>(V)) - return LessOrEqualTypeSize(Return->getReturnValue()); - if (auto *ZExt = dyn_cast<ZExtInst>(V)) - return GreaterThanTypeSize(ZExt); - if (auto *Switch = dyn_cast<SwitchInst>(V)) - return LessThanTypeSize(Switch->getCondition()); - if (auto *ICmp = dyn_cast<ICmpInst>(V)) - return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0)); - - return isa<CallInst>(V); -} - -/// Return whether this instruction can safely wrap. -bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) { - // We can support a, potentially, wrapping instruction (I) if: - // - It is only used by an unsigned icmp. - // - The icmp uses a constant. - // - The wrapping value (I) is decreasing, i.e would underflow - wrapping - // around zero to become a larger number than before. - // - The wrapping instruction (I) also uses a constant. - // - // We can then use the two constants to calculate whether the result would - // wrap in respect to itself in the original bitwidth. If it doesn't wrap, - // just underflows the range, the icmp would give the same result whether the - // result has been truncated or not. We calculate this by: - // - Zero extending both constants, if needed, to 32-bits. - // - Take the absolute value of I's constant, adding this to the icmp const. - // - Check that this value is not out of range for small type. If it is, it - // means that it has underflowed enough to wrap around the icmp constant. - // - // For example: - // - // %sub = sub i8 %a, 2 - // %cmp = icmp ule i8 %sub, 254 - // - // If %a = 0, %sub = -2 == FE == 254 - // But if this is evalulated as a i32 - // %sub = -2 == FF FF FF FE == 4294967294 - // So the unsigned compares (i8 and i32) would not yield the same result. - // - // Another way to look at it is: - // %a - 2 <= 254 - // %a + 2 <= 254 + 2 - // %a <= 256 - // And we can't represent 256 in the i8 format, so we don't support it. 
- // - // Whereas: - // - // %sub i8 %a, 1 - // %cmp = icmp ule i8 %sub, 254 - // - // If %a = 0, %sub = -1 == FF == 255 - // As i32: - // %sub = -1 == FF FF FF FF == 4294967295 - // - // In this case, the unsigned compare results would be the same and this - // would also be true for ult, uge and ugt: - // - (255 < 254) == (0xFFFFFFFF < 254) == false - // - (255 <= 254) == (0xFFFFFFFF <= 254) == false - // - (255 > 254) == (0xFFFFFFFF > 254) == true - // - (255 >= 254) == (0xFFFFFFFF >= 254) == true - // - // To demonstrate why we can't handle increasing values: - // - // %add = add i8 %a, 2 - // %cmp = icmp ult i8 %add, 127 - // - // If %a = 254, %add = 256 == (i8 1) - // As i32: - // %add = 256 - // - // (1 < 127) != (256 < 127) - - unsigned Opc = I->getOpcode(); - if (Opc != Instruction::Add && Opc != Instruction::Sub) - return false; - - if (!I->hasOneUse() || - !isa<ICmpInst>(*I->user_begin()) || - !isa<ConstantInt>(I->getOperand(1))) - return false; - - ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1)); - bool NegImm = OverflowConst->isNegative(); - bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || - ((Opc == Instruction::Add) && NegImm); - if (!IsDecreasing) - return false; - - // Don't support an icmp that deals with sign bits. - auto *CI = cast<ICmpInst>(*I->user_begin()); - if (CI->isSigned() || CI->isEquality()) - return false; - - ConstantInt *ICmpConst = nullptr; - if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0))) - ICmpConst = Const; - else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1))) - ICmpConst = Const; - else - return false; - - // Now check that the result can't wrap on itself. - APInt Total = ICmpConst->getValue().getBitWidth() < 32 ? - ICmpConst->getValue().zext(32) : ICmpConst->getValue(); - - Total += OverflowConst->getValue().getBitWidth() < 32 ? - OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs(); - - APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize); - - if (Total.getBitWidth() > Max.getBitWidth()) { - if (Total.ugt(Max.zext(Total.getBitWidth()))) - return false; - } else if (Max.getBitWidth() > Total.getBitWidth()) { - if (Total.zext(Max.getBitWidth()).ugt(Max)) - return false; - } else if (Total.ugt(Max)) - return false; - - LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); - SafeWrap.insert(I); - return true; -} - -static bool shouldPromote(Value *V) { - if (!isa<IntegerType>(V->getType()) || isSink(V)) - return false; - - if (isSource(V)) - return true; - - auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - if (isa<ICmpInst>(I)) - return false; - - return true; -} - -/// Return whether we can safely mutate V's type to ExtTy without having to be -/// concerned with zero extending or truncation. -static bool isPromotedResultSafe(Value *V) { - if (GenerateSignBits(V)) - return false; - - if (!isa<Instruction>(V)) - return true; - - if (!isa<OverflowingBinaryOperator>(V)) - return true; - - return cast<Instruction>(V)->hasNoUnsignedWrap(); -} - -/// Return the intrinsic for the instruction that can perform the same -/// operation but on a narrow type. This is using the parallel dsp intrinsics -/// on scalar values. -static Intrinsic::ID getNarrowIntrinsic(Instruction *I) { - // Whether we use the signed or unsigned versions of these intrinsics - // doesn't matter because we're not using the GE bits that they set in - // the APSR. 
- switch(I->getOpcode()) { - default: - break; - case Instruction::Add: - return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 : - Intrinsic::arm_uadd8; - case Instruction::Sub: - return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 : - Intrinsic::arm_usub8; - } - llvm_unreachable("unhandled opcode for narrow intrinsic"); -} - -void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { - SmallVector<Instruction*, 4> Users; - Instruction *InstTo = dyn_cast<Instruction>(To); - bool ReplacedAll = true; - - LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To - << "\n"); - - for (Use &U : From->uses()) { - auto *User = cast<Instruction>(U.getUser()); - if (InstTo && User->isIdenticalTo(InstTo)) { - ReplacedAll = false; - continue; - } - Users.push_back(User); - } - - for (auto *U : Users) - U->replaceUsesOfWith(From, To); - - if (ReplacedAll) - if (auto *I = dyn_cast<Instruction>(From)) - InstsToRemove.insert(I); -} - -void IRPromoter::PrepareWrappingAdds() { - LLVM_DEBUG(dbgs() << "ARM CGP: Prepare underflowing adds.\n"); - IRBuilder<> Builder{Ctx}; - - // For adds that safely wrap and use a negative immediate as operand 1, we - // create an equivalent instruction using a positive immediate. - // That positive immediate can then be zext along with all the other - // immediates later. - for (auto *I : *SafeWrap) { - if (I->getOpcode() != Instruction::Add) - continue; - - LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n"); - assert((isa<ConstantInt>(I->getOperand(1)) && - cast<ConstantInt>(I->getOperand(1))->isNegative()) && - "Wrapping should have a negative immediate as the second operand"); - - auto Const = cast<ConstantInt>(I->getOperand(1)); - auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs()); - Builder.SetInsertPoint(I); - Value *NewVal = Builder.CreateSub(I->getOperand(0), NewConst); - if (auto *NewInst = dyn_cast<Instruction>(NewVal)) { - NewInst->copyIRFlags(I); - NewInsts.insert(NewInst); - } - InstsToRemove.insert(I); - I->replaceAllUsesWith(NewVal); - LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n"); - } - for (auto *I : NewInsts) - Visited->insert(I); -} - -void IRPromoter::ExtendSources() { - IRBuilder<> Builder{Ctx}; - - auto InsertZExt = [&](Value *V, Instruction *InsertPt) { - assert(V->getType() != ExtTy && "zext already extends to i32"); - LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n"); - Builder.SetInsertPoint(InsertPt); - if (auto *I = dyn_cast<Instruction>(V)) - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - - Value *ZExt = Builder.CreateZExt(V, ExtTy); - if (auto *I = dyn_cast<Instruction>(ZExt)) { - if (isa<Argument>(V)) - I->moveBefore(InsertPt); - else - I->moveAfter(InsertPt); - NewInsts.insert(I); - } - - ReplaceAllUsersOfWith(V, ZExt); - }; - - // Now, insert extending instructions between the sources and their users. - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n"); - for (auto V : *Sources) { - LLVM_DEBUG(dbgs() << " - " << *V << "\n"); - if (auto *I = dyn_cast<Instruction>(V)) - InsertZExt(I, I); - else if (auto *Arg = dyn_cast<Argument>(V)) { - BasicBlock &BB = Arg->getParent()->front(); - InsertZExt(Arg, &*BB.getFirstInsertionPt()); - } else { - llvm_unreachable("unhandled source that needs extending"); - } - Promoted.insert(V); - } -} - -void IRPromoter::PromoteTree() { - LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); - - IRBuilder<> Builder{Ctx}; - - // Mutate the types of the instructions within the tree. 
Here we handle - // constant operands. - for (auto *V : *Visited) { - if (Sources->count(V)) - continue; - - auto *I = cast<Instruction>(V); - if (Sinks->count(I)) - continue; - - for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) { - Value *Op = I->getOperand(i); - if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType())) - continue; - - if (auto *Const = dyn_cast<ConstantInt>(Op)) { - Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy); - I->setOperand(i, NewConst); - } else if (isa<UndefValue>(Op)) - I->setOperand(i, UndefValue::get(ExtTy)); - } - - if (shouldPromote(I)) { - I->mutateType(ExtTy); - Promoted.insert(I); - } - } - - // Finally, any instructions that should be promoted but haven't yet been, - // need to be handled using intrinsics. - for (auto *V : *Visited) { - auto *I = dyn_cast<Instruction>(V); - if (!I) - continue; - - if (Sources->count(I) || Sinks->count(I)) - continue; - - if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) - continue; - - assert(EnableDSP && "DSP intrinisc insertion not enabled!"); - - // Replace unsafe instructions with appropriate intrinsic calls. - LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " - << *I << "\n"); - Function *DSPInst = - Intrinsic::getDeclaration(M, getNarrowIntrinsic(I)); - Builder.SetInsertPoint(I); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - Value *Args[] = { I->getOperand(0), I->getOperand(1) }; - CallInst *Call = Builder.CreateCall(DSPInst, Args); - NewInsts.insert(Call); - ReplaceAllUsersOfWith(I, Call); - } -} - -void IRPromoter::TruncateSinks() { - LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n"); - - IRBuilder<> Builder{Ctx}; - - auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* { - if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType())) - return nullptr; - - if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V)) - return nullptr; - - LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for " - << *V << "\n"); - Builder.SetInsertPoint(cast<Instruction>(V)); - auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy)); - if (Trunc) - NewInsts.insert(Trunc); - return Trunc; - }; - - // Fix up any stores or returns that use the results of the promoted - // chain. - for (auto I : *Sinks) { - LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n"); - - // Handle calls separately as we need to iterate over arg operands. - if (auto *Call = dyn_cast<CallInst>(I)) { - for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { - Value *Arg = Call->getArgOperand(i); - Type *Ty = TruncTysMap[Call][i]; - if (Instruction *Trunc = InsertTrunc(Arg, Ty)) { - Trunc->moveBefore(Call); - Call->setArgOperand(i, Trunc); - } - } - continue; - } - - // Special case switches because we need to truncate the condition. - if (auto *Switch = dyn_cast<SwitchInst>(I)) { - Type *Ty = TruncTysMap[Switch][0]; - if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) { - Trunc->moveBefore(Switch); - Switch->setCondition(Trunc); - } - continue; - } - - // Now handle the others. 
- for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Type *Ty = TruncTysMap[I][i]; - if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) { - Trunc->moveBefore(I); - I->setOperand(i, Trunc); - } - } - } -} - -void IRPromoter::Cleanup() { - LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n"); - // Some zexts will now have become redundant, along with their trunc - // operands, so remove them - for (auto V : *Visited) { - if (!isa<ZExtInst>(V)) - continue; - - auto ZExt = cast<ZExtInst>(V); - if (ZExt->getDestTy() != ExtTy) - continue; - - Value *Src = ZExt->getOperand(0); - if (ZExt->getSrcTy() == ZExt->getDestTy()) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt - << "\n"); - ReplaceAllUsersOfWith(ZExt, Src); - continue; - } - - // Unless they produce a value that is narrower than ExtTy, we can - // replace the result of the zext with the input of a newly inserted - // trunc. - if (NewInsts.count(Src) && isa<TruncInst>(Src) && - Src->getType() == OrigTy) { - auto *Trunc = cast<TruncInst>(Src); - assert(Trunc->getOperand(0)->getType() == ExtTy && - "expected inserted trunc to be operating on i32"); - ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0)); - } - } - - for (auto *I : InstsToRemove) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); - I->dropAllReferences(); - I->eraseFromParent(); - } - - InstsToRemove.clear(); - NewInsts.clear(); - TruncTysMap.clear(); - Promoted.clear(); - SafeToPromote->clear(); - SafeWrap->clear(); -} - -void IRPromoter::ConvertTruncs() { - LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n"); - IRBuilder<> Builder{Ctx}; - - for (auto *V : *Visited) { - if (!isa<TruncInst>(V) || Sources->count(V)) - continue; - - auto *Trunc = cast<TruncInst>(V); - Builder.SetInsertPoint(Trunc); - IntegerType *SrcTy = cast<IntegerType>(Trunc->getOperand(0)->getType()); - IntegerType *DestTy = cast<IntegerType>(TruncTysMap[Trunc][0]); - - unsigned NumBits = DestTy->getScalarSizeInBits(); - ConstantInt *Mask = - ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue()); - Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); - - if (auto *I = dyn_cast<Instruction>(Masked)) - NewInsts.insert(I); - - ReplaceAllUsersOfWith(Trunc, Masked); - } -} - -void IRPromoter::Mutate(Type *OrigTy, - SetVector<Value*> &Visited, - SmallPtrSetImpl<Value*> &Sources, - SmallPtrSetImpl<Instruction*> &Sinks, - SmallPtrSetImpl<Instruction*> &SafeToPromote, - SmallPtrSetImpl<Instruction*> &SafeWrap) { - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " - << ARMCodeGenPrepare::TypeSize << " to 32-bits\n"); - - assert(isa<IntegerType>(OrigTy) && "expected integer type"); - this->OrigTy = cast<IntegerType>(OrigTy); - assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() && - "original type not smaller than extended type"); - - this->Visited = &Visited; - this->Sources = &Sources; - this->Sinks = &Sinks; - this->SafeToPromote = &SafeToPromote; - this->SafeWrap = &SafeWrap; - - // Cache original types of the values that will likely need truncating - for (auto *I : Sinks) { - if (auto *Call = dyn_cast<CallInst>(I)) { - for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { - Value *Arg = Call->getArgOperand(i); - TruncTysMap[Call].push_back(Arg->getType()); - } - } else if (auto *Switch = dyn_cast<SwitchInst>(I)) - TruncTysMap[I].push_back(Switch->getCondition()->getType()); - else { - for (unsigned i = 0; i < I->getNumOperands(); ++i) - TruncTysMap[I].push_back(I->getOperand(i)->getType()); - } - } - for 
(auto *V : Visited) { - if (!isa<TruncInst>(V) || Sources.count(V)) - continue; - auto *Trunc = cast<TruncInst>(V); - TruncTysMap[Trunc].push_back(Trunc->getDestTy()); - } - - // Convert adds using negative immediates to equivalent instructions that use - // positive constants. - PrepareWrappingAdds(); - - // Insert zext instructions between sources and their users. - ExtendSources(); - - // Promote visited instructions, mutating their types in place. Also insert - // DSP intrinsics, if enabled, for adds and subs which would be unsafe to - // promote. - PromoteTree(); - - // Convert any truncs, that aren't sources, into AND masks. - ConvertTruncs(); - - // Insert trunc instructions for use by calls, stores etc... - TruncateSinks(); - - // Finally, remove unecessary zexts and truncs, delete old instructions and - // clear the data structures. - Cleanup(); - - LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n"); -} - -/// We accept most instructions, as well as Arguments and ConstantInsts. We -/// Disallow casts other than zext and truncs and only allow calls if their -/// return value is zeroext. We don't allow opcodes that can introduce sign -/// bits. -bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - if (auto *I = dyn_cast<Instruction>(V)) { - switch (I->getOpcode()) { - default: - return isa<BinaryOperator>(I) && isSupportedType(I) && - !GenerateSignBits(I); - case Instruction::GetElementPtr: - case Instruction::Store: - case Instruction::Br: - case Instruction::Switch: - return true; - case Instruction::PHI: - case Instruction::Select: - case Instruction::Ret: - case Instruction::Load: - case Instruction::Trunc: - case Instruction::BitCast: - return isSupportedType(I); - case Instruction::ZExt: - return isSupportedType(I->getOperand(0)); - case Instruction::ICmp: - // Now that we allow small types than TypeSize, only allow icmp of - // TypeSize because they will require a trunc to be legalised. - // TODO: Allow icmp of smaller types, and calculate at the end - // whether the transform would be beneficial. - if (isa<PointerType>(I->getOperand(0)->getType())) - return true; - return EqualTypeSize(I->getOperand(0)); - case Instruction::Call: { - // Special cases for calls as we need to check for zeroext - // TODO We should accept calls even if they don't have zeroext, as they - // can still be sinks. - auto *Call = cast<CallInst>(I); - return isSupportedType(Call) && - Call->hasRetAttr(Attribute::AttrKind::ZExt); - } - } - } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) { - return isSupportedType(V); - } else if (isa<Argument>(V)) - return isSupportedType(V); - - return isa<BasicBlock>(V); -} - -/// Check that the type of V would be promoted and that the original type is -/// smaller than the targeted promoted type. Check that we're not trying to -/// promote something larger than our base 'TypeSize' type. -bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { - - auto *I = dyn_cast<Instruction>(V); - if (!I) - return true; - - if (SafeToPromote.count(I)) - return true; - - if (isPromotedResultSafe(V) || isSafeWrap(I)) { - SafeToPromote.insert(I); - return true; - } - - if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) - return false; - - // If promotion is not safe, can we use a DSP instruction to natively - // handle the narrow type? - if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) - return false; - - if (ST->isThumb() && !ST->hasThumb2()) - return false; - - // TODO - // Would it be profitable? 
For Thumb code, these parallel DSP instructions - // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For - // Cortex-A, specifically Cortex-A72, the latency is double and throughput is - // halved. They also do not take immediates as operands. - for (auto &Op : I->operands()) { - if (isa<Constant>(Op)) { - if (!EnableDSPWithImms) - return false; - } - } - LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n"); - return true; -} - -bool ARMCodeGenPrepare::TryToPromote(Value *V) { - OrigTy = V->getType(); - TypeSize = OrigTy->getPrimitiveSizeInBits(); - if (TypeSize > 16 || TypeSize < 8) - return false; - - SafeToPromote.clear(); - SafeWrap.clear(); - - if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) - return false; - - LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = " - << TypeSize << "\n"); - - SetVector<Value*> WorkList; - SmallPtrSet<Value*, 8> Sources; - SmallPtrSet<Instruction*, 4> Sinks; - SetVector<Value*> CurrentVisited; - WorkList.insert(V); - - // Return true if V was added to the worklist as a supported instruction, - // if it was already visited, or if we don't need to explore it (e.g. - // pointer values and GEPs), and false otherwise. - auto AddLegalInst = [&](Value *V) { - if (CurrentVisited.count(V)) - return true; - - // Ignore GEPs because they don't need promoting and the constant indices - // will prevent the transformation. - if (isa<GetElementPtrInst>(V)) - return true; - - if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { - LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); - return false; - } - - WorkList.insert(V); - return true; - }; - - // Iterate through, and add to, a tree of operands and users in the use-def. - while (!WorkList.empty()) { - Value *V = WorkList.back(); - WorkList.pop_back(); - if (CurrentVisited.count(V)) - continue; - - // Ignore non-instructions, other than arguments. - if (!isa<Instruction>(V) && !isSource(V)) - continue; - - // If we've already visited this value from somewhere, bail now because - // the tree has already been explored. - // TODO: This could limit the transform, ie if we try to promote something - // from an i8 and fail first, before trying an i16. - if (AllVisited.count(V)) - return false; - - CurrentVisited.insert(V); - AllVisited.insert(V); - - // Calls can be both sources and sinks. - if (isSink(V)) - Sinks.insert(cast<Instruction>(V)); - - if (isSource(V)) - Sources.insert(V); - - if (!isSink(V) && !isSource(V)) { - if (auto *I = dyn_cast<Instruction>(V)) { - // Visit operands of any instruction visited. - for (auto &U : I->operands()) { - if (!AddLegalInst(U)) - return false; - } - } - } - - // Don't visit users of a node which isn't going to be mutated unless its a - // source. 
- if (isSource(V) || shouldPromote(V)) { - for (Use &U : V->uses()) { - if (!AddLegalInst(U.getUser())) - return false; - } - } - } - - LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; - for (auto *I : CurrentVisited) - I->dump(); - ); - unsigned ToPromote = 0; - for (auto *V : CurrentVisited) { - if (Sources.count(V)) - continue; - if (Sinks.count(cast<Instruction>(V))) - continue; - ++ToPromote; - } - - if (ToPromote < 2) - return false; - - Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote, - SafeWrap); - return true; -} - -bool ARMCodeGenPrepare::doInitialization(Module &M) { - Promoter = new IRPromoter(&M); - return false; -} - -bool ARMCodeGenPrepare::runOnFunction(Function &F) { - if (skipFunction(F) || DisableCGP) - return false; - - auto *TPC = &getAnalysis<TargetPassConfig>(); - if (!TPC) - return false; - - const TargetMachine &TM = TPC->getTM<TargetMachine>(); - ST = &TM.getSubtarget<ARMSubtarget>(F); - bool MadeChange = false; - LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n"); - - // Search up from icmps to try to promote their operands. - for (BasicBlock &BB : F) { - auto &Insts = BB.getInstList(); - for (auto &I : Insts) { - if (AllVisited.count(&I)) - continue; - - if (isa<ICmpInst>(I)) { - auto &CI = cast<ICmpInst>(I); - - // Skip signed or pointer compares - if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType())) - continue; - - LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); - - for (auto &Op : CI.operands()) { - if (auto *I = dyn_cast<Instruction>(Op)) - MadeChange |= TryToPromote(I); - } - } - } - LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { - dbgs() << F; - report_fatal_error("Broken function after type promotion"); - }); - } - if (MadeChange) - LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n"); - - return MadeChange; -} - -bool ARMCodeGenPrepare::doFinalization(Module &M) { - delete Promoter; - return false; -} - -INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE, - "ARM IR optimizations", false, false) -INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations", - false, false) - -char ARMCodeGenPrepare::ID = 0; -unsigned ARMCodeGenPrepare::TypeSize = 0; - -FunctionPass *llvm::createARMCodeGenPreparePass() { - return new ARMCodeGenPrepare(); -} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 24ca25f73e96..634fb89b8e89 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1917,6 +1917,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() { MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(ARM::t2LE)); + // Swapped a t2Bcc for a t2LE, so no need to update the size of the block. MIB.add(Br.MI->getOperand(0)); Br.MI->eraseFromParent(); Br.MI = MIB; @@ -1975,21 +1976,20 @@ bool ARMConstantIslands::optimizeThumb2Branches() { .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); Cmp.MI->eraseFromParent(); - BBInfoVector &BBInfo = BBUtils->getBBInfo(); - BBInfo[MBB->getNumber()].Size -= 2; if (Br.MI->getOpcode() == ARM::tBcc) { Br.MI->eraseFromParent(); Br.MI = NewBR; - } else if (&MBB->back() != Br.MI) { - // We've generated an LE and already erased the original conditional - // branch. The CBN?Z is now used to branch to the other successor, so an - // unconditional branch terminator is now redundant. 
+ BBUtils->adjustBBSize(MBB, -2); + } else if (MBB->back().getOpcode() != ARM::t2LE) { + // An LE has been generated, but it's not the terminator - that is an + // unconditional branch. However, the logic has now been reversed with the + // CBN?Z being the conditional branch and the LE being the unconditional + // branch. So this means we can remove the redundant unconditional branch + // at the end of the block. MachineInstr *LastMI = &MBB->back(); - if (LastMI != Br.MI) { - BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); - LastMI->eraseFromParent(); - } + BBUtils->adjustBBSize(MBB, -LastMI->getDesc().getSize()); + LastMI->eraseFromParent(); } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 563fdda56104..2c3ac816219f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1213,9 +1213,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBBI = NewMI; return true; } + case ARM::VMOVHcc: case ARM::VMOVScc: case ARM::VMOVDcc: { - unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; + unsigned newOpc = Opcode != ARM::VMOVDcc ? ARM::VMOVS : ARM::VMOVD; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), MI.getOperand(1).getReg()) .add(MI.getOperand(2)) @@ -1951,6 +1952,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case ARM::LOADDUAL: + case ARM::STOREDUAL: { + Register PairReg = MI.getOperand(0).getReg(); + + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD)) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_0), + Opcode == ARM::LOADDUAL ? RegState::Define : 0) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_1), + Opcode == ARM::LOADDUAL ? RegState::Define : 0); + for (unsigned i = 1; i < MI.getNumOperands(); i++) + MIB.add(MI.getOperand(i)); + MIB.add(predOps(ARMCC::AL)); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); + return true; + } } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp index 1fc5ff6921c6..6e19db3c7e22 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -1879,6 +1879,8 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, report_fatal_error("Can't return in GHC call convention"); else return CC_ARM_APCS_GHC; + case CallingConv::CFGuard_Check: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); } } @@ -2564,8 +2566,12 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return SelectCall(&I, "memset"); } case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get( - Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP)); + unsigned Opcode; + if (Subtarget->isThumb()) + Opcode = ARM::tTRAP; + else + Opcode = Subtarget->useNaClTrap() ? 
ARM::TRAPNaCl : ARM::TRAP; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode)); return true; } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 01ae93086dcb..cb98b2b34efd 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2128,10 +2128,16 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilledForFarJump(true); } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); +} + +void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, + BitVector &SavedRegs) const { + TargetFrameLowering::getCalleeSaves(MF, SavedRegs); // If we have the "returned" parameter attribute which guarantees that we // return the value which was passed in r0 unmodified (e.g. C++ 'structors), // record that fact for IPRA. + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); if (AFI->getPreservesR0()) SavedRegs.set(ARM::R0); } @@ -2418,7 +2424,8 @@ void ARMFrameLowering::adjustForSegmentedStacks( } else { // Get TLS base address from the coprocessor // mrc p15, #0, SR0, c13, c0, #3 - BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0) + BuildMI(McrMBB, DL, TII.get(Thumb ? ARM::t2MRC : ARM::MRC), + ScratchReg0) .addImm(15) .addImm(0) .addImm(13) @@ -2432,7 +2439,8 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Get the stack limit from the right offset // ldr SR0, [sr0, #4 * TlsOffset] - BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0) + BuildMI(GetMBB, DL, TII.get(Thumb ? ARM::t2LDRi12 : ARM::LDRi12), + ScratchReg0) .addReg(ScratchReg0) .addImm(4 * TlsOffset) .add(predOps(ARMCC::AL)); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h index 6d8aee597945..0462b01af707 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -53,6 +53,8 @@ public: int ResolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, int SPAdj) const; + void getCalleeSaves(const MachineFunction &MF, + BitVector &SavedRegs) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h index b5ac694e01f7..ca02cc739e11 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h @@ -27,14 +27,13 @@ class MachineInstr; /// ARM preRA scheduler uses an unspecialized instance of the /// ScoreboardHazardRecognizer. 
class ARMHazardRecognizer : public ScoreboardHazardRecognizer { - MachineInstr *LastMI; - unsigned FpMLxStalls; + MachineInstr *LastMI = nullptr; + unsigned FpMLxStalls = 0; public: ARMHazardRecognizer(const InstrItineraryData *ItinData, const ScheduleDAG *DAG) - : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), - LastMI(nullptr) {} + : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched") {} HazardType getHazardType(SUnit *SU, int Stalls) override; void Reset() override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 8f6515c423eb..76a9ac12062d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -144,6 +145,8 @@ public: // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + template <unsigned Shift> + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, @@ -158,6 +161,9 @@ public: SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); + template<int Min, int Max> + bool SelectImmediateInRange(SDValue N, SDValue &OffImm); + inline bool is_so_imm(unsigned Imm) const { return ARM_AM::getSOImmVal(Imm) != -1; } @@ -209,6 +215,59 @@ private: unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes); + /// Helper functions for setting up clusters of MVE predication operands. + template <typename SDValueVector> + void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, + SDValue PredicateMask); + template <typename SDValueVector> + void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, + SDValue PredicateMask, SDValue Inactive); + + template <typename SDValueVector> + void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc); + template <typename SDValueVector> + void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, EVT InactiveTy); + + /// SelectMVE_WB - Select MVE writeback load/store intrinsics. + void SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated); + + /// SelectMVE_LongShift - Select MVE 64-bit scalar shift intrinsics. + void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate, + bool HasSaturationOperand); + + /// SelectMVE_VADCSBC - Select MVE vector add/sub-with-carry intrinsics. + void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, + uint16_t OpcodeWithNoCarry, bool Add, bool Predicated); + + /// Select long MVE vector reductions with two vector operands + /// Stride is the number of vector element widths the instruction can operate + /// on: + /// 2 for long non-rounding variants, vml{a,s}ldav[a][x]: [i16, i32] + /// 1 for long rounding variants: vrml{a,s}ldavh[a][x]: [i32] + /// Stride is used when addressing the OpcodesS array which contains multiple + /// opcodes for each element width. 
+ /// TySize is the index into the list of element types listed above + void SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, const uint16_t *OpcodesU, + size_t Stride, size_t TySize); + + /// Select a 64-bit MVE vector reduction with two vector operands + /// arm_mve_vmlldava_[predicated] + void SelectMVE_VMLLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + /// Select a 72-bit MVE vector rounding reduction with two vector operands + /// int_arm_mve_vrmlldavha[_predicated] + void SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + + /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs + /// should be 2 or 4. The opcode array specifies the instructions + /// used for 8, 16 and 32-bit lane sizes respectively, and each + /// pointer points to a set of NumVecs sub-opcodes used for the + /// different stages (e.g. VLD20 versus VLD21) of each load family. + void SelectMVE_VLD(SDNode *N, unsigned NumVecs, + const uint16_t *const *Opcodes); + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. @@ -1237,6 +1296,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, return true; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R - imm8 operands. @@ -1319,11 +1405,27 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, unsigned Shift) { unsigned Opcode = Op->getOpcode(); - ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) - ? cast<LoadSDNode>(Op)->getAddressingMode() - : cast<StoreSDNode>(Op)->getAddressingMode(); + ISD::MemIndexedMode AM; + switch (Opcode) { + case ISD::LOAD: + AM = cast<LoadSDNode>(Op)->getAddressingMode(); + break; + case ISD::STORE: + AM = cast<StoreSDNode>(Op)->getAddressingMode(); + break; + case ISD::MLOAD: + AM = cast<MaskedLoadSDNode>(Op)->getAddressingMode(); + break; + case ISD::MSTORE: + AM = cast<MaskedStoreSDNode>(Op)->getAddressingMode(); + break; + default: + llvm_unreachable("Unexpected Opcode for Imm7Offset"); + } + int RHSC; - if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + // 7 bit constant, shifted by Shift. + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? 
CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) @@ -1334,6 +1436,16 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, return false; } +template <int Min, int Max> +bool ARMDAGToDAGISel::SelectImmediateInRange(SDValue N, SDValue &OffImm) { + int Val; + if (isScaledConstantInRange(N, 1, Min, Max, Val)) { + OffImm = CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32); + return true; + } + return false; +} + bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { @@ -1593,58 +1705,93 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { } bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { - LoadSDNode *LD = cast<LoadSDNode>(N); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - if (AM == ISD::UNINDEXED) - return false; - EVT LoadedVT = LD->getMemoryVT(); - if (!LoadedVT.isVector()) - return false; - bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; - SDValue Offset; - bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + EVT LoadedVT; unsigned Opcode = 0; - unsigned Align = LD->getAlignment(); - bool IsLE = Subtarget->isLittle(); + bool isSExtLd, isPre; + unsigned Align; + ARMVCC::VPTCodes Pred; + SDValue PredReg; + SDValue Chain, Base, Offset; + + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::None; + PredReg = CurDAG->getRegister(0, MVT::i32); + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::Then; + PredReg = LD->getMask(); + } else + llvm_unreachable("Expected a Load or a Masked Load!"); + + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N); + + SDValue NewOffset; if (Align >= 2 && LoadedVT == MVT::v4i16 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; else Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; } else if (LoadedVT == MVT::v8i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; else Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; } else if (LoadedVT == MVT::v4i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? 
ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; } else if (Align >= 4 && - (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + (CanChangeType || LoadedVT == MVT::v4i32 || + LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2)) Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; else if (Align >= 2 && - (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + (CanChangeType || LoadedVT == MVT::v8i16 || + LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; - else if ((IsLE || LoadedVT == MVT::v16i8) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + else if ((CanChangeType || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; else return false; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - SDValue Ops[] = {Base, Offset, - CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), - CurDAG->getRegister(0, MVT::i32), Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + SDValue Ops[] = {Base, NewOffset, + CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, + Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); @@ -2304,6 +2451,268 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, CurDAG->RemoveDeadNode(N); } +template <typename SDValueVector> +void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, + SDValue PredicateMask) { + Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32)); + Ops.push_back(PredicateMask); +} + +template <typename SDValueVector> +void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, + SDValue PredicateMask, + SDValue Inactive) { + Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32)); + Ops.push_back(PredicateMask); + Ops.push_back(Inactive); +} + +template <typename SDValueVector> +void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc) { + Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); +} + +template <typename SDValueVector> +void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, + EVT InactiveTy) { + Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, InactiveTy), 0)); +} + +void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, + bool Predicated) { + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + uint16_t Opcode; + switch (N->getValueType(1).getVectorElementType().getSizeInBits()) { + case 32: + Opcode = Opcodes[0]; + break; + case 64: + Opcode = Opcodes[1]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_WB"); + } + + Ops.push_back(N->getOperand(2)); // vector of base addresses + + int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); // 
immediate offset + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(4)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + Ops.push_back(N->getOperand(0)); // chain + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, + bool Immediate, + bool HasSaturationOperand) { + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + // Two 32-bit halves of the value to be shifted + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(2)); + + // The shift count + if (Immediate) { + int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count + } else { + Ops.push_back(N->getOperand(3)); + } + + // The immediate saturation operand, if any + if (HasSaturationOperand) { + int32_t SatOp = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(); + int SatBit = (SatOp == 64 ? 0 : 1); + Ops.push_back(getI32Imm(SatBit, Loc)); + } + + // MVE scalar shifts are IT-predicable, so include the standard + // predicate arguments. + Ops.push_back(getAL(CurDAG, Loc)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, + uint16_t OpcodeWithNoCarry, + bool Add, bool Predicated) { + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + uint16_t Opcode; + + unsigned FirstInputOp = Predicated ? 2 : 1; + + // Two input vectors and the input carry flag + Ops.push_back(N->getOperand(FirstInputOp)); + Ops.push_back(N->getOperand(FirstInputOp + 1)); + SDValue CarryIn = N->getOperand(FirstInputOp + 2); + ConstantSDNode *CarryInConstant = dyn_cast<ConstantSDNode>(CarryIn); + uint32_t CarryMask = 1 << 29; + uint32_t CarryExpected = Add ? 
0 : CarryMask; + if (CarryInConstant && + (CarryInConstant->getZExtValue() & CarryMask) == CarryExpected) { + Opcode = OpcodeWithNoCarry; + } else { + Ops.push_back(CarryIn); + Opcode = OpcodeWithCarry; + } + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, + N->getOperand(FirstInputOp + 3), // predicate + N->getOperand(FirstInputOp - 1)); // inactive + else + AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0)); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +static bool SDValueToConstBool(SDValue SDVal) { + assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant"); + ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal); + uint64_t Value = SDValConstant->getZExtValue(); + assert((Value == 0 || Value == 1) && "expected value 0 or 1"); + return Value; +} + +void ARMDAGToDAGISel::SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU, + size_t Stride, size_t TySize) { + assert(TySize < Stride && "Invalid TySize"); + bool IsUnsigned = SDValueToConstBool(N->getOperand(1)); + bool IsSub = SDValueToConstBool(N->getOperand(2)); + bool IsExchange = SDValueToConstBool(N->getOperand(3)); + if (IsUnsigned) { + assert(!IsSub && + "Unsigned versions of vmlsldav[a]/vrmlsldavh[a] do not exist"); + assert(!IsExchange && + "Unsigned versions of vmlaldav[a]x/vrmlaldavh[a]x do not exist"); + } + + auto OpIsZero = [N](size_t OpNo) { + if (ConstantSDNode *OpConst = dyn_cast<ConstantSDNode>(N->getOperand(OpNo))) + if (OpConst->getZExtValue() == 0) + return true; + return false; + }; + + // If the input accumulator value is not zero, select an instruction with + // accumulator, otherwise select an instruction without accumulator + bool IsAccum = !(OpIsZero(4) && OpIsZero(5)); + + const uint16_t *Opcodes = IsUnsigned ? 
OpcodesU : OpcodesS; + if (IsSub) + Opcodes += 4 * Stride; + if (IsExchange) + Opcodes += 2 * Stride; + if (IsAccum) + Opcodes += Stride; + uint16_t Opcode = Opcodes[TySize]; + + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + // Push the accumulator operands, if they are used + if (IsAccum) { + Ops.push_back(N->getOperand(4)); + Ops.push_back(N->getOperand(5)); + } + // Push the two vector operands + Ops.push_back(N->getOperand(6)); + Ops.push_back(N->getOperand(7)); + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(8)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectMVE_VMLLDAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + EVT VecTy = N->getOperand(6).getValueType(); + size_t SizeIndex; + switch (VecTy.getVectorElementType().getSizeInBits()) { + case 16: + SizeIndex = 0; + break; + case 32: + SizeIndex = 1; + break; + default: + llvm_unreachable("bad vector element size"); + } + + SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 2, SizeIndex); +} + +void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + assert( + N->getOperand(6).getValueType().getVectorElementType().getSizeInBits() == + 32 && + "bad vector element size"); + SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 1, 0); +} + +void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, + const uint16_t *const *Opcodes) { + EVT VT = N->getValueType(0); + SDLoc Loc(N); + + const uint16_t *OurOpcodes; + switch (VT.getVectorElementType().getSizeInBits()) { + case 8: + OurOpcodes = Opcodes[0]; + break; + case 16: + OurOpcodes = Opcodes[1]; + break; + case 32: + OurOpcodes = Opcodes[2]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_VLD"); + } + + EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2); + EVT ResultTys[] = {DataTy, MVT::Other}; + + auto Data = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0); + SDValue Chain = N->getOperand(0); + for (unsigned Stage = 0; Stage < NumVecs; ++Stage) { + SDValue Ops[] = {Data, N->getOperand(2), Chain}; + auto LoadInst = + CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops); + Data = SDValue(LoadInst, 0); + Chain = SDValue(LoadInst, 1); + } + + for (unsigned i = 0; i < NumVecs; i++) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data)); + ReplaceUses(SDValue(N, NumVecs), Chain); + CurDAG->RemoveDeadNode(N); +} + void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, @@ -3089,6 +3498,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::MLOAD: + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; + // Other cases are autogenerated. + break; case ARMISD::WLS: case ARMISD::LE: { SDValue Ops[] = { N->getOperand(1), @@ -3101,6 +3515,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->RemoveDeadNode(N); return; } + case ARMISD::LDRD: { + if (Subtarget->isThumb2()) + break; // TableGen handles isel in this case. 
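// Illustrative sketch (not part of this change): the ARMISD::LDRD selection
// below splits the LOADDUAL result into gsub_0 (low word) and gsub_1 (high
// word), and the matching LowerLOAD code later recombines them with
// BUILD_PAIR. A minimal host-side model of that pairing, assuming a
// little-endian host to mirror the little-endian ARM target:
#include <cstdint>
#include <cstring>

// Recombine the two 32-bit words an LDRD would load from `p` into the
// original 64-bit value: Rt (gsub_0) holds the low word from [p],
// Rt2 (gsub_1) holds the high word from [p + 4].
uint64_t ldrd_pair_model(const uint8_t *p) {
  uint32_t Lo, Hi;
  std::memcpy(&Lo, p, 4);      // word at [p]
  std::memcpy(&Hi, p + 4, 4);  // word at [p + 4]
  return uint64_t(Lo) | (uint64_t(Hi) << 32);  // BUILD_PAIR(Lo, Hi)
}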
+ SDValue Base, RegOffset, ImmOffset; + const SDValue &Chain = N->getOperand(0); + const SDValue &Addr = N->getOperand(1); + SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain}; + SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl, + {MVT::Untyped, MVT::Other}, Ops); + SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, + SDValue(New, 0)); + SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, + SDValue(New, 0)); + ReplaceUses(SDValue(N, 0), Lo); + ReplaceUses(SDValue(N, 1), Hi); + ReplaceUses(SDValue(N, 2), SDValue(New, 1)); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::LOOP_DEC: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), @@ -4028,6 +4462,117 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); return; } + + case Intrinsic::arm_mve_vldr_gather_base_wb: + case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { + static const uint16_t Opcodes[] = {ARM::MVE_VLDRWU32_qi_pre, + ARM::MVE_VLDRDU64_qi_pre}; + SelectMVE_WB(N, Opcodes, + IntNo == Intrinsic::arm_mve_vldr_gather_base_wb_predicated); + return; + } + + case Intrinsic::arm_mve_vld2q: { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, ARM::MVE_VLD21_8}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16, + ARM::MVE_VLD21_16}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, + ARM::MVE_VLD21_32}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 2, Opcodes); + return; + } + + case Intrinsic::arm_mve_vld4q: { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8, + ARM::MVE_VLD42_8, ARM::MVE_VLD43_8}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16, + ARM::MVE_VLD42_16, + ARM::MVE_VLD43_16}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32, + ARM::MVE_VLD42_32, + ARM::MVE_VLD43_32}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 4, Opcodes); + return; + } + } + break; + } + + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_mve_urshrl: + SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false); + return; + case Intrinsic::arm_mve_uqshll: + SelectMVE_LongShift(N, ARM::MVE_UQSHLL, true, false); + return; + case Intrinsic::arm_mve_srshrl: + SelectMVE_LongShift(N, ARM::MVE_SRSHRL, true, false); + return; + case Intrinsic::arm_mve_sqshll: + SelectMVE_LongShift(N, ARM::MVE_SQSHLL, true, false); + return; + case Intrinsic::arm_mve_uqrshll: + SelectMVE_LongShift(N, ARM::MVE_UQRSHLL, false, true); + return; + case Intrinsic::arm_mve_sqrshrl: + SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true); + return; + case Intrinsic::arm_mve_lsll: + SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false); + return; + case Intrinsic::arm_mve_asrl: + SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false); + return; + + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: + SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true, + IntNo == Intrinsic::arm_mve_vadc_predicated); + return; + + case Intrinsic::arm_mve_vmlldava: + case Intrinsic::arm_mve_vmlldava_predicated: { + static const uint16_t OpcodesU[] = { + ARM::MVE_VMLALDAVu16, ARM::MVE_VMLALDAVu32, + ARM::MVE_VMLALDAVau16, ARM::MVE_VMLALDAVau32, + }; + static const uint16_t OpcodesS[] = { + ARM::MVE_VMLALDAVs16, 
ARM::MVE_VMLALDAVs32, + ARM::MVE_VMLALDAVas16, ARM::MVE_VMLALDAVas32, + ARM::MVE_VMLALDAVxs16, ARM::MVE_VMLALDAVxs32, + ARM::MVE_VMLALDAVaxs16, ARM::MVE_VMLALDAVaxs32, + ARM::MVE_VMLSLDAVs16, ARM::MVE_VMLSLDAVs32, + ARM::MVE_VMLSLDAVas16, ARM::MVE_VMLSLDAVas32, + ARM::MVE_VMLSLDAVxs16, ARM::MVE_VMLSLDAVxs32, + ARM::MVE_VMLSLDAVaxs16, ARM::MVE_VMLSLDAVaxs32, + }; + SelectMVE_VMLLDAV(N, IntNo == Intrinsic::arm_mve_vmlldava_predicated, + OpcodesS, OpcodesU); + return; + } + + case Intrinsic::arm_mve_vrmlldavha: + case Intrinsic::arm_mve_vrmlldavha_predicated: { + static const uint16_t OpcodesU[] = { + ARM::MVE_VRMLALDAVHu32, ARM::MVE_VRMLALDAVHau32, + }; + static const uint16_t OpcodesS[] = { + ARM::MVE_VRMLALDAVHs32, ARM::MVE_VRMLALDAVHas32, + ARM::MVE_VRMLALDAVHxs32, ARM::MVE_VRMLALDAVHaxs32, + ARM::MVE_VRMLSLDAVHs32, ARM::MVE_VRMLSLDAVHas32, + ARM::MVE_VRMLSLDAVHxs32, ARM::MVE_VRMLSLDAVHaxs32, + }; + SelectMVE_VRMLLDAVH(N, IntNo == Intrinsic::arm_mve_vrmlldavha_predicated, + OpcodesS, OpcodesU); + return; + } } break; } @@ -4551,10 +5096,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, switch(ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); - case InlineAsm::Constraint_i: - // FIXME: It seems strange that 'i' is needed here since it's supposed to - // be an immediate and not a memory constraint. - LLVM_FALLTHROUGH; case InlineAsm::Constraint_m: case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp index db26feb57010..cf738cd66434 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -78,6 +78,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -142,6 +143,11 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); +static cl::opt<unsigned> +MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, + cl::desc("Maximum interleave factor for MVE VLDn to generate."), + cl::init(2)); + // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 @@ -209,6 +215,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, VT != MVT::v2i64 && VT != MVT::v1i64) for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); + if (!VT.isFloatingPoint()) + for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -296,6 +305,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } } @@ -322,6 +333,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } if (HasMVEFP) { @@ -366,6 +379,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); + // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal); + // Some truncating stores are legal too. setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); @@ -374,12 +394,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { // Pre and Post inc on these are legal, given the correct extends for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::v8i8, Legal); - setIndexedStoreAction(im, MVT::v8i8, Legal); - setIndexedLoadAction(im, MVT::v4i8, Legal); - setIndexedStoreAction(im, MVT::v4i8, Legal); - setIndexedLoadAction(im, MVT::v4i16, Legal); - setIndexedStoreAction(im, MVT::v4i16, Legal); + for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); + } } // Predicate types @@ -446,7 +466,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, - { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, // Double-precision comparisons. { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, @@ -456,7 +475,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, - { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, // Floating-point to integer conversions. 
// i64 conversions are done via library routines even when generating VFP @@ -520,7 +538,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Single-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 4 @@ -538,7 +555,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Floating-point to integer conversions. // RTABI chapter 4.1.2, Table 6 @@ -964,19 +980,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); } if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - if (Subtarget->hasFullFP16()) + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + if (Subtarget->hasFullFP16()) { setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + } } - if (!Subtarget->hasFP16()) + if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); - - if (!Subtarget->hasFP64()) - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + } computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -1050,6 +1073,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom @@ -1170,9 +1195,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIVREM, MVT::i32, Expand); } - if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT()) - for (auto &VT : {MVT::f32, MVT::f64}) - setOperationAction(ISD::FPOWI, VT, Custom); + if (Subtarget->getTargetTriple().isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); + } setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); @@ -1571,6 +1598,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + case ARMISD::LDRD: return "ARMISD::LDRD"; + case ARMISD::STRD: return 
"ARMISD::STRD"; + case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; @@ -1855,6 +1885,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: case CallingConv::GHC: + case CallingConv::CFGuard_Check: return CC; case CallingConv::PreserveMost: return CallingConv::PreserveMost; @@ -1914,6 +1945,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); case CallingConv::PreserveMost: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + case CallingConv::CFGuard_Check: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); } } @@ -2062,11 +2095,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; - auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); bool PreferIndirect = false; // Disable tail calls if they're not supported. - if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") + if (!Subtarget->supportsTailCall()) isTailCall = false; if (isa<GlobalAddressSDNode>(Callee)) { @@ -2331,12 +2363,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); - unsigned TargetFlags = GV->hasDLLImportStorageClass() - ? ARMII::MO_DLLIMPORT - : ARMII::MO_NO_FLAG; + unsigned TargetFlags = ARMII::MO_NO_FLAG; + if (GV->hasDLLImportStorageClass()) + TargetFlags = ARMII::MO_DLLIMPORT; + else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + TargetFlags = ARMII::MO_COFFSTUB; Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); - if (GV->hasDLLImportStorageClass()) + if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), @@ -2941,9 +2975,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; - auto Attr = - CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); - if (!CI->isTailCall() || Attr.getValueAsString() == "true") + if (!CI->isTailCall()) return false; return true; @@ -3629,6 +3661,49 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } + case Intrinsic::arm_cls: { + const SDValue &Operand = Op.getOperand(1); + const EVT VTy = Op.getValueType(); + SDValue SRA = + DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); + SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); + SDValue SHL = + DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); + SDValue OR = + DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); + SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); + return Result; + } + case Intrinsic::arm_cls64: { + // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) + // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) + const SDValue &Operand = Op.getOperand(1); + const EVT VTy = Op.getValueType(); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, + DAG.getConstant(1, dl, VTy)); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, + 
DAG.getConstant(0, dl, VTy)); + SDValue Constant0 = DAG.getConstant(0, dl, VTy); + SDValue Constant1 = DAG.getConstant(1, dl, VTy); + SDValue Constant31 = DAG.getConstant(31, dl, VTy); + SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); + SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); + SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); + SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); + SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); + SDValue CheckLo = + DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); + SDValue HiIsZero = + DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); + SDValue AdjustedLo = + DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); + SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); + SDValue Result = + DAG.getSelect(dl, VTy, CheckLo, + DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); + return Result; + } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -3698,6 +3773,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_neon_vtbl2: return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::arm_mve_pred_i2v: + case Intrinsic::arm_mve_pred_v2i: + return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), + Op.getOperand(1)); } } @@ -4887,7 +4966,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { Opcode = ARMISD::CSINC; std::swap(TrueVal, FalseVal); std::swap(TVal, FVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } if (Opcode) { @@ -4897,7 +4976,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { std::swap(TrueVal, FalseVal); std::swap(TVal, FVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } // Attempt to use ZR checking TVal is 0, possibly inverting the condition @@ -4906,7 +4985,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { if (FVal == 0 && Opcode != ARMISD::CSINC) { std::swap(TrueVal, FalseVal); std::swap(TVal, FVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } if (TVal == 0) TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); @@ -4950,7 +5029,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); std::swap(TrueVal, FalseVal); } } @@ -5310,17 +5389,31 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) { + + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); + + if (isUnsupportedFloatingType(SrcVal.getValueType())) { RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), + if (Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), + LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); + SDLoc Loc(Op); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - CallOptions, SDLoc(Op)).first; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Result; + std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, + CallOptions, Loc, Chain); + return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; + } + + // FIXME: Remove this when we have strict fp instruction selection patterns + if (IsStrict) { + DAG.mutateStrictFPToFP(Op.getNode()); } return Op; @@ -5517,7 +5610,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, +Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<unsigned>(RegName) .Case("sp", ARM::SP) @@ -7745,6 +7838,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } +static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, + ArrayRef<int> ShuffleMask, + SelectionDAG &DAG) { + // Attempt to lower the vector shuffle using as many whole register movs as + // possible. This is useful for types smaller than 32bits, which would + // often otherwise become a series for grp movs. + SDLoc dl(Op); + EVT VT = Op.getValueType(); + if (VT.getScalarSizeInBits() >= 32) + return SDValue(); + + assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + "Unexpected vector type"); + int NumElts = VT.getVectorNumElements(); + int QuarterSize = NumElts / 4; + // The four final parts of the vector, as i32's + SDValue Parts[4]; + + // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not + // <u,u,u,u>), returning the vmov lane index + auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { + // Detect which mov lane this would be from the first non-undef element. 
+ int MovIdx = -1; + for (int i = 0; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0) { + if (ShuffleMask[Start + i] % Length != i) + return -1; + MovIdx = ShuffleMask[Start + i] / Length; + break; + } + } + // If all items are undef, leave this for other combines + if (MovIdx == -1) + return -1; + // Check the remaining values are the correct part of the same mov + for (int i = 1; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0 && + (ShuffleMask[Start + i] / Length != MovIdx || + ShuffleMask[Start + i] % Length != i)) + return -1; + } + return MovIdx; + }; + + for (int Part = 0; Part < 4; ++Part) { + // Does this part look like a mov + int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); + if (Elt != -1) { + SDValue Input = Op->getOperand(0); + if (Elt >= 4) { + Input = Op->getOperand(1); + Elt -= 4; + } + SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, + DAG.getConstant(Elt, dl, MVT::i32)); + } + } + + // Nothing interesting found, just return + if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) + return SDValue(); + + // The other parts need to be built with the old shuffle vector, cast to a + // v4i32 and extract_vector_elts + if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { + SmallVector<int, 16> NewShuffleMask; + for (int Part = 0; Part < 4; ++Part) + for (int i = 0; i < QuarterSize; i++) + NewShuffleMask.push_back( + Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]); + SDValue NewShuffle = DAG.getVectorShuffle( + VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); + SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); + + for (int Part = 0; Part < 4; ++Part) + if (!Parts[Part]) + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + BitCast, DAG.getConstant(Part, dl, MVT::i32)); + } + // Build a vector out of the various parts and bitcast it back to the original + // type. 
+ SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); + return DAG.getBitcast(VT, NewVec); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -7939,6 +8118,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; + if (ST->hasMVEIntegerOps()) + if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) + return NewOp; + return SDValue(); } @@ -8905,6 +9088,24 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); } +void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT MemVT = LD->getMemoryVT(); + assert(LD->isUnindexed() && "Loads should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && LD->isVolatile()) { + SDLoc dl(N); + SDValue Result = DAG.getMemIntrinsicNode( + ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), + {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, + Result.getValue(0), Result.getValue(1)); + Results.append({Pair, Result.getValue(2)}); + } +} + static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); @@ -8934,6 +9135,40 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { ST->getMemOperand()); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert(ST->isUnindexed() && "Stores should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && ST->isVolatile()) { + SDNode *N = Op.getNode(); + SDLoc dl(N); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(1, dl, MVT::i32)); + + return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), + {ST->getChain(), Lo, Hi, ST->getBasePtr()}, + MemVT, ST->getMemOperand()); + } else if (Subtarget->hasMVEIntegerOps() && + ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1))) { + return LowerPredicateStore(Op, DAG); + } + + return SDValue(); +} + +static bool isZeroVector(SDValue N) { + return (ISD::isBuildVectorAllZeros(N.getNode()) || + (N->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(N->getOperand(0)))); +} + static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); MVT VT = Op.getSimpleValueType(); @@ -8941,13 +9176,7 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { SDValue PassThru = N->getPassThru(); SDLoc dl(Op); - auto IsZero = [](SDValue PassThru) { - return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || - (PassThru->getOpcode() == ARMISD::VMOVIMM && - isNullConstant(PassThru->getOperand(0)))); - }; - - if (IsZero(PassThru)) + if (isZeroVector(PassThru)) return Op; // MVE Masked loads use zero as the passthru value. 
Here we convert undef to @@ -8955,12 +9184,13 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0, dl, MVT::i32)); SDValue NewLoad = DAG.getMaskedLoad( - VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), - N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, + N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; if (!PassThru.isUndef() && (PassThru.getOpcode() != ISD::BITCAST || - !IsZero(PassThru->getOperand(0)))) + !isZeroVector(PassThru->getOperand(0)))) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } @@ -9043,58 +9273,6 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, Results.push_back(SDValue(CmpSwap, 2)); } -static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, - SelectionDAG &DAG) { - const auto &TLI = DAG.getTargetLoweringInfo(); - - assert(Subtarget.getTargetTriple().isOSMSVCRT() && - "Custom lowering is MSVCRT specific!"); - - SDLoc dl(Op); - SDValue Val = Op.getOperand(0); - MVT Ty = Val->getSimpleValueType(0); - SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); - SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow", - TLI.getPointerTy(DAG.getDataLayout())); - - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Node = Val; - Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsZExt = true; - Args.push_back(Entry); - - Entry.Node = Exponent; - Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsZExt = true; - Args.push_back(Entry); - - Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); - - // In the in-chain to the call is the entry node If we are emitting a - // tailcall, the chain will be mutated if the node has a non-entry input - // chain. - SDValue InChain = DAG.getEntryNode(); - SDValue TCChain = InChain; - - const Function &F = DAG.getMachineFunction().getFunction(); - bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && - F.getReturnType() == LCRTy; - if (IsTC) - InChain = TCChain; - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(InChain) - .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) - .setTailCall(IsTC); - std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI); - - // Return the chain (the DAG root) if it is a tail call - return !CI.second.getNode() ? 
DAG.getRoot() : CI.first; -} - SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -9114,6 +9292,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -9170,7 +9350,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::LOAD: return LowerPredicateLoad(Op, DAG); case ISD::STORE: - return LowerPredicateStore(Op, DAG); + return LowerSTORE(Op, DAG, Subtarget); case ISD::MLOAD: return LowerMLOAD(Op, DAG); case ISD::ATOMIC_LOAD: @@ -9182,9 +9362,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetWindows()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } @@ -9271,7 +9452,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ABS: lowerABS(N, Results, DAG); return ; - + case ISD::LOAD: + LowerLOAD(N, Results, DAG); + break; } if (Res.getNode()) Results.push_back(Res); @@ -11711,7 +11894,8 @@ static SDValue PerformADDCombine(SDNode *N, /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -11720,7 +11904,28 @@ static SDValue PerformSUBCombine(SDNode *N, if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) return Result; - return SDValue(); + if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) + return SDValue(); + + // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) + // so that we can readily pattern match more mve instructions which can use + // a scalar operand. 
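  // Sketch of the rewrite (illustration only, in SelectionDAG terms):
  //   before: t0 = ARMvdup GPR:x
  //           t1 = ARMvmovImm 0
  //           t2 = sub t1, t0
  //   after:  t3 = sub i32 0, GPR:x   (negate the scalar once)
  //           t2 = ARMvdup t3         (splat the negated value)
  // which leaves a plain ARMvdup that scalar-operand MVE patterns can then
  // consume.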
+ SDValue VDup = N->getOperand(1); + if (VDup->getOpcode() != ARMISD::VDUP) + return SDValue(); + + SDValue VMov = N->getOperand(0); + if (VMov->getOpcode() == ISD::BITCAST) + VMov = VMov->getOperand(0); + + if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) + return SDValue(); + + SDLoc dl(N); + SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, + DCI.DAG.getConstant(0, dl, MVT::i32), + VDup->getOperand(0)); + return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); } /// PerformVMULCombine @@ -12736,6 +12941,39 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return SDValue(); } +static SDValue PerformVCMPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + ARMCC::CondCodes Cond = + (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + SDLoc dl(N); + + // vcmp X, 0, cc -> vcmpz X, cc + if (isZeroVector(Op1)) + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, + N->getOperand(2)); + + unsigned SwappedCond = getSwappedCondition(Cond); + if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { + // vcmp 0, X, cc -> vcmpz X, reversed(cc) + if (isZeroVector(Op0)) + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); + // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) + if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) + return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); + } + + return SDValue(); +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -13844,11 +14082,12 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue N0 = N->getOperand(0); - // Check for sign- and zero-extensions of vector extract operations of 8- - // and 16-bit vector elements. NEON supports these directly. They are + // Check for sign- and zero-extensions of vector extract operations of 8- and + // 16-bit vector elements. NEON and MVE support these directly. They are // handled during DAG combining because type legalization will promote them // to 32-bit types and it is messy to recognize the operations after that. 
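  // Illustration only: for input along the lines of
  //   %e = extractelement <8 x i16> %v, i32 3
  //   %s = sext i16 %e to i32
  // type legalization would promote the i16 value to i32 and obscure the
  // pair, so it is recognized here first and can later select to a single
  // sign-extending lane move (e.g. vmov.s16) on both NEON and MVE.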
- if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && + N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue Vec = N0.getOperand(0); SDValue Lane = N0.getOperand(1); EVT VT = N->getValueType(0); @@ -14067,7 +14306,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, return SDValue(); if (Negate) - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { return (CC == ISD::SETEQ && Imm == 0) || @@ -14371,7 +14610,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); - case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); @@ -14415,6 +14654,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformARMBUILD_VECTORCombine(N, DCI); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); + case ARMISD::VCMP: + return PerformVCMPCombine(N, DCI, Subtarget); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -14523,7 +14764,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, if (!VT.isSimple()) return false; - // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus + // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); auto Ty = VT.getSimpleVT().SimpleTy; @@ -14725,8 +14966,12 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, switch (I->getOpcode()) { case Instruction::Add: case Instruction::Mul: + case Instruction::ICmp: return true; case Instruction::Sub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: return Operand == 1; default: return false; @@ -14808,6 +15053,40 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } +/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster +/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be +/// expanded to FMAs when this method returns true, otherwise fmuladd is +/// expanded to fmul + fadd. +/// +/// ARM supports both fused and unfused multiply-add operations; we already +/// lower a pair of fmul and fadd to the latter so it's not clear that there +/// would be a gain or that the gain would be worthwhile enough to risk +/// correctness bugs. +/// +/// For MVE, we set this to true as it helps simplify the need for some +/// patterns (and we don't have the non-fused floating point instruction). 
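// Standalone illustration of the correctness caveat above (a hypothetical
// example, not part of this file): a fused multiply-add rounds once, while
// fmul followed by fadd rounds twice, so contracting fmuladd into an FMA can
// change the numeric result.
#include <cmath>
#include <cstdio>
int main() {
  double a = 1.0 + 0x1.0p-27, b = 1.0 - 0x1.0p-27, c = -1.0;
  std::printf("unfused: %a\n", a * b + c);         // a*b rounds to 1.0, prints 0x0p+0
  std::printf("fused:   %a\n", std::fma(a, b, c)); // exact a*b + c, prints -0x1p-54
}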
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + case MVT::v8f16: + return Subtarget->hasMVEFloatOps(); + case MVT::f16: + return Subtarget->useFPVFMx16(); + case MVT::f32: + return Subtarget->useFPVFMx(); + case MVT::f64: + return Subtarget->useFPVFMx64(); + default: + break; + } + + return false; +} + static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; @@ -14850,7 +15129,7 @@ static bool isLegalT2AddressImmediate(int64_t V, EVT VT, V = -V; } - unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U); + unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); // MVE: size * imm7 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { @@ -15155,14 +15434,19 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, } static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, - bool isSEXTLoad, bool isLE, SDValue &Base, - SDValue &Offset, bool &isInc, - SelectionDAG &DAG) { + bool isSEXTLoad, bool IsMasked, bool isLE, + SDValue &Base, SDValue &Offset, + bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; if (!isa<ConstantSDNode>(Ptr->getOperand(1))) return false; + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = isLE && !IsMasked; + ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); int RHSC = (int)RHS->getZExtValue(); @@ -15181,7 +15465,7 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, }; // Try to find a matching instruction based on s/zext, Alignment, Offset and - // (in BE) type. + // (in BE/masked) type. 
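  // For reference (an architectural note, not specific to this patch): these
  // indexed MVE load/store encodings take a 7-bit immediate scaled by the
  // access size, so the reachable offsets are at most +/-127 bytes for .8,
  // +/-254 (in steps of 2) for .16 and +/-508 (in steps of 4) for .32
  // accesses, which is what the 0x80 limit and scale passed to IsInRange
  // below encode.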
Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) @@ -15189,13 +15473,15 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + } else if (Align >= 4 && + (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + else if (Align >= 2 && + (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; - else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) return true; return false; } @@ -15215,6 +15501,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue Ptr; unsigned Align; bool isSEXTLoad = false; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); @@ -15224,6 +15511,17 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Align = ST->getAlignment(); + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); + IsMasked = true; } else return false; @@ -15232,8 +15530,8 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, - Subtarget->isLittle(), Base, Offset, - isInc, DAG); + IsMasked, Subtarget->isLittle(), Base, + Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -15261,6 +15559,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue Ptr; unsigned Align; bool isSEXTLoad = false, isNonExt; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); @@ -15272,6 +15571,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, Ptr = ST->getBasePtr(); Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); + isNonExt = !ST->isTruncatingStore(); + IsMasked = true; } else return false; @@ -15295,7 +15607,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { @@ -16048,7 +16360,8 @@ 
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - SDValue SrcVal = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); const unsigned DstSz = Op.getValueType().getSizeInBits(); const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && @@ -16068,34 +16381,35 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDLoc Loc(Op); RTLIB::Libcall LC; MakeLibCallOptions CallOptions; - if (SrcSz == 16) { - // Instruction from 16 -> 32 - if (Subtarget->hasFP16()) - SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal); - // Lib call from 16 -> 32 - else { - LC = RTLIB::getFPEXT(MVT::f16, MVT::f32); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { + bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); + MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); + MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64); + if (Supported) { + if (IsStrict) { + SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, + {DstVT, MVT::Other}, {Chain, SrcVal}); + Chain = SrcVal.getValue(1); + } else { + SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); + } + } else { + LC = RTLIB::getFPEXT(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_EXTEND"); - SrcVal = - makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first; + std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, + Loc, Chain); } } - if (DstSz != 64) - return SrcVal; - // For sure now SrcVal is 32 bits - if (Subtarget->hasFP64()) // Instruction from 32 -> 64 - return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal); - - LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); - assert(LC != RTLIB::UNKNOWN_LIBCALL && - "Unexpected type for custom-lowering FP_EXTEND"); - return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first; + return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - SDValue SrcVal = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); EVT DstVT = Op.getValueType(); const unsigned DstSz = Op.getValueType().getSizeInBits(); @@ -16118,7 +16432,11 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_ROUND"); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Result; + std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, + Loc, Chain); + return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; } void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -16644,15 +16962,20 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } bool ARMTargetLowering::isLegalInterleavedAccessType( - VectorType *VecTy, const DataLayout &DL) const { + unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) + return false; + // Ensure the vector doesn't have f16 elements. Even though we could do an // i16 vldN, we can't hold the f16 vectors and will end up converting via // f32. - if (VecTy->getElementType()->isHalfTy()) + if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) + return false; + if (Subtarget->hasMVEIntegerOps() && Factor == 3) return false; // Ensure the number of vector elements is greater than 1. @@ -16665,12 +16988,16 @@ bool ARMTargetLowering::isLegalInterleavedAccessType( // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. - return VecSize == 64 || VecSize % 128 == 0; + if (Subtarget->hasNEON() && VecSize == 64) + return true; + return VecSize % 128 == 0; } unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { if (Subtarget->hasNEON()) return 4; + if (Subtarget->hasMVEIntegerOps()) + return MVEMaxSupportedInterleaveFactor; return TargetLoweringBase::getMaxSupportedInterleaveFactor(); } @@ -16702,7 +17029,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) + if (!isLegalInterleavedAccessType(Factor, VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -16734,13 +17061,37 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); - Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); - Type *Tys[] = {VecTy, Int8Ptr}; - static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, - Intrinsic::arm_neon_vld3, - Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + auto createLoadIntrinsic = [&](Value *BaseAddr) { + if (Subtarget->hasNEON()) { + Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); + Type *Tys[] = {VecTy, Int8Ptr}; + static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, + Intrinsic::arm_neon_vld3, + Intrinsic::arm_neon_vld4}; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + + SmallVector<Value *, 2> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + Ops.push_back(Builder.getInt32(LI->getAlignment())); + + return Builder.CreateCall(VldnFunc, Ops, "vldN"); + } else { + assert((Factor == 2 || Factor == 4) && + "expected interleave factor of 2 or 4 for MVE"); + Intrinsic::ID LoadInts = + Factor == 2 ? 
Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; + Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace()); + Type *Tys[] = {VecTy, VecEltTy}; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); + + SmallVector<Value *, 2> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); + return Builder.CreateCall(VldnFunc, Ops, "vldN"); + } + }; // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will @@ -16755,11 +17106,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, VecTy->getVectorNumElements() * Factor); - SmallVector<Value *, 2> Ops; - Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlignment())); - - CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); + CallInst *VldN = createLoadIntrinsic(BaseAddr); // Replace uses of each shufflevector with the corresponding vector loaded // by ldN. @@ -16838,7 +17185,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) + if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); @@ -16882,11 +17229,46 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, auto Mask = SVI->getShuffleMask(); - Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); - Type *Tys[] = {Int8Ptr, SubVecTy}; - static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; + auto createStoreIntrinsic = [&](Value *BaseAddr, + SmallVectorImpl<Value *> &Shuffles) { + if (Subtarget->hasNEON()) { + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); + Type *Tys[] = {Int8Ptr, SubVecTy}; + + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + + SmallVector<Value *, 6> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + for (auto S : Shuffles) + Ops.push_back(S); + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + } else { + assert((Factor == 2 || Factor == 4) && + "expected interleave factor of 2 or 4 for MVE"); + Intrinsic::ID StoreInts = + Factor == 2 ? 
Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; + Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace()); + Type *Tys[] = {EltPtrTy, SubVecTy}; + Function *VstNFunc = + Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); + + SmallVector<Value *, 6> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); + for (auto S : Shuffles) + Ops.push_back(S); + for (unsigned F = 0; F < Factor; F++) { + Ops.push_back(Builder.getInt32(F)); + Builder.CreateCall(VstNFunc, Ops); + Ops.pop_back(); + } + } + }; for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { // If we generating more than one store, we compute the base address of @@ -16895,17 +17277,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), BaseAddr, LaneLen * Factor); - SmallVector<Value *, 6> Ops; - Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - - Function *VstNFunc = - Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + SmallVector<Value *, 4> Shuffles; // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) { unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( + Shuffles.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; @@ -16922,13 +17300,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( + Shuffles.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } - Ops.push_back(Builder.getInt32(SI->getAlignment())); - Builder.CreateCall(VstNFunc, Ops); + createStoreIntrinsic(BaseAddr, Shuffles); } return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h index 53813fad5afd..1baa22a4fa56 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h @@ -278,7 +278,11 @@ class VectorType; VST4_UPD, VST2LN_UPD, VST3LN_UPD, - VST4LN_UPD + VST4LN_UPD, + + // Load/Store of dual registers + LDRD, + STRD }; } // end namespace ARMISD @@ -377,7 +381,7 @@ class VectorType; bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; - /// Returns true if the addresing mode representing by AM is legal + /// Returns true if the addressing mode representing by AM is legal /// for the Thumb1 target, for a load/store of the specified type. bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const; @@ -604,7 +608,7 @@ class VectorType; /// Returns true if \p VecTy is a legal interleaved access type. This /// function checks the vector element type and the overall width of the /// vector. 
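    /// A rough standalone model of the legality rule as extended by this
    /// commit (illustrative names; the element-size and element-count checks
    /// kept from the unmodified context are omitted here):
    static bool isLegalInterleavedAccessModel(bool HasNEON, bool HasMVE,
                                              unsigned Factor, bool HalfFloatElts,
                                              unsigned VecSizeInBits) {
      if (!HasNEON && !HasMVE)
        return false;               // needs one of the two vector extensions
      if (HasNEON && HalfFloatElts)
        return false;               // NEON would end up converting f16 via f32
      if (HasMVE && Factor == 3)
        return false;               // MVE only has the vld2q/vld4q intrinsics
      if (HasNEON && VecSizeInBits == 64)
        return true;                // a single 64-bit NEON access is fine
      return VecSizeInBits % 128 == 0; // wider types split into 128-bit accesses
    }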
- bool isLegalInterleavedAccessType(VectorType *VecTy, + bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy, const DataLayout &DL) const; bool alignLoopsWithOptSize() const override; @@ -731,23 +735,17 @@ class VectorType; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; + void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const; - Register getRegisterByName(const char* RegName, EVT VT, + Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - /// - /// ARM supports both fused and unfused multiply-add operations; we already - /// lower a pair of fmul and fadd to the latter so it's not clear that there - /// would be a gain or that the gain would be worthwhile enough to risk - /// correctness bugs. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; } + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td index fe696222ec70..ce67af6f1b49 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -243,6 +243,12 @@ def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; +def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -297,6 +303,28 @@ class RegConstraint<string C> { string Constraints = C; } +// ARMCC condition codes. See ARMCC::CondCodes +def ARMCCeq : PatLeaf<(i32 0)>; +def ARMCCne : PatLeaf<(i32 1)>; +def ARMCChs : PatLeaf<(i32 2)>; +def ARMCClo : PatLeaf<(i32 3)>; +def ARMCCmi : PatLeaf<(i32 4)>; +def ARMCCpl : PatLeaf<(i32 5)>; +def ARMCCvs : PatLeaf<(i32 6)>; +def ARMCCvc : PatLeaf<(i32 7)>; +def ARMCChi : PatLeaf<(i32 8)>; +def ARMCCls : PatLeaf<(i32 9)>; +def ARMCCge : PatLeaf<(i32 10)>; +def ARMCClt : PatLeaf<(i32 11)>; +def ARMCCgt : PatLeaf<(i32 12)>; +def ARMCCle : PatLeaf<(i32 13)>; +def ARMCCal : PatLeaf<(i32 14)>; + +// VCC predicates. See ARMVCC::VPTCodes +def ARMVCCNone : PatLeaf<(i32 0)>; +def ARMVCCThen : PatLeaf<(i32 1)>; +def ARMVCCElse : PatLeaf<(i32 2)>; + //===----------------------------------------------------------------------===// // ARM specific transformation functions and pattern fragments. // @@ -913,7 +941,10 @@ def MVEShiftImm1_7AsmOperand: ImmAsmOperand<1,7> { // encodings allow. 
let DiagnosticString = "operand must be an immediate in the range [1,8]"; } -def mve_shift_imm1_7 : Operand<i32> { +def mve_shift_imm1_7 : Operand<i32>, + // SelectImmediateInRange / isScaledConstantInRange uses a + // half-open interval, so the parameters <1,8> mean 1-7 inclusive + ComplexPattern<i32, 1, "SelectImmediateInRange<1,8>", [], []> { let ParserMatchClass = MVEShiftImm1_7AsmOperand; let EncoderMethod = "getMVEShiftImmOpValue"; } @@ -926,7 +957,10 @@ def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> { // encodings allow. let DiagnosticString = "operand must be an immediate in the range [1,16]"; } -def mve_shift_imm1_15 : Operand<i32> { +def mve_shift_imm1_15 : Operand<i32>, + // SelectImmediateInRange / isScaledConstantInRange uses a + // half-open interval, so the parameters <1,16> mean 1-15 inclusive + ComplexPattern<i32, 1, "SelectImmediateInRange<1,16>", [], []> { let ParserMatchClass = MVEShiftImm1_15AsmOperand; let EncoderMethod = "getMVEShiftImmOpValue"; } @@ -2667,6 +2701,14 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { Requires<[IsARM, HasV5TE]>; } +let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr), + 64, IIC_iLoad_d_r, []>, + Requires<[IsARM, HasV5TE]> { + let AM = AddrMode3; +} +} + def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), NoItinerary, "lda", "\t$Rt, $addr", []>; def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), @@ -2942,6 +2984,19 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { } } +let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr), + 64, IIC_iStore_d_r, []>, + Requires<[IsARM, HasV5TE]> { + let AM = AddrMode3; +} +} + +let Predicates = [IsARM, HasV5TE] in { +def : Pat<(ARMstrd GPR:$Rt, GPR:$Rt2, addrmode3:$addr), + (STOREDUAL (REG_SEQUENCE GPRPair, GPR:$Rt, gsub_0, GPR:$Rt2, gsub_1), addrmode3:$addr)>; +} + // Indexed stores multiclass AI2_stridx<bit isByte, string opc, InstrItinClass iii, InstrItinClass iir> { @@ -6214,7 +6269,7 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp), } def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, - [(atomic_fence imm:$ordering, 0)]> { + [(atomic_fence timm:$ordering, 0)]> { let hasSideEffects = 1; let Size = 0; let AsmString = "@ COMPILER BARRIER"; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td index 4f67cd6e47cc..604291be822c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -275,6 +275,83 @@ class mve_addr_q_shift<int shift> : MemOperand { let MIOperandInfo = (ops MQPR:$base, i32imm:$imm); } +// A family of classes wrapping up information about the vector types +// used by MVE. +class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, + bits<2> size, string suffixletter, bit unsigned> { + // The LLVM ValueType representing the vector, so we can use it in + // ISel patterns. + ValueType Vec = vec; + + // The LLVM ValueType representing a vector with elements double the size + // of those in Vec, so we can use it in ISel patterns. It is up to the + // invoker of this class to ensure that this is a correct choice. 
+ ValueType DblVec = dblvec; + + // An LLVM ValueType representing a corresponding vector of + // predicate bits, for use in ISel patterns that handle an IR + // intrinsic describing the predicated form of the instruction. + // + // Usually, for a vector of N things, this will be vNi1. But for + // vectors of 2 values, we make an exception, and use v4i1 instead + // of v2i1. Rationale: MVE codegen doesn't support doing all the + // auxiliary operations on v2i1 (vector shuffles etc), and also, + // there's no MVE compare instruction that will _generate_ v2i1 + // directly. + ValueType Pred = pred; + + // The most common representation of the vector element size in MVE + // instruction encodings: a 2-bit value V representing an (8<<V)-bit + // vector element. + bits<2> Size = size; + + // For vectors explicitly mentioning a signedness of integers: 0 for + // signed and 1 for unsigned. For anything else, undefined. + bit Unsigned = unsigned; + + // The number of bits in a vector element, in integer form. + int LaneBits = !shl(8, Size); + + // The suffix used in assembly language on an instruction operating + // on this lane if it only cares about number of bits. + string BitsSuffix = !if(!eq(suffixletter, "p"), + !if(!eq(unsigned, 0b0), "8", "16"), + !cast<string>(LaneBits)); + + // The suffix used on an instruction that mentions the whole type. + string Suffix = suffixletter ## BitsSuffix; + + // The letter part of the suffix only. + string SuffixLetter = suffixletter; +} + +// Integer vector types that don't treat signed and unsigned differently. +def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>; +def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>; +def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>; +def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>; + +// Explicitly signed and unsigned integer vectors. They map to the +// same set of LLVM ValueTypes as above, but are represented +// differently in assembly and instruction encodings. +def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>; +def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>; +def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>; +def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>; + +// FP vector types. +def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>; + +// Polynomial vector types. 
+def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>; +def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>; + // --------- Start of base classes for the instructions themselves class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm, @@ -346,9 +423,12 @@ class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr, let Inst{19-16} = RdaDest{3-0}; } -class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4, list<dag> pattern=[]> +class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4> : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm), - "$RdaSrc, $imm", "$RdaDest = $RdaSrc", pattern> { + "$RdaSrc, $imm", "$RdaDest = $RdaSrc", + [(set rGPR:$RdaDest, + (i32 (!cast<Intrinsic>("int_arm_mve_" # iname) + (i32 rGPR:$RdaSrc), (i32 imm:$imm))))]> { bits<5> imm; let Inst{15} = 0b0; @@ -364,9 +444,12 @@ def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>; def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>; def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>; -class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]> +class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4> : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm), - "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", pattern> { + "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", + [(set rGPR:$RdaDest, + (i32 (!cast<Intrinsic>("int_arm_mve_" # iname) + (i32 rGPR:$RdaSrc), (i32 rGPR:$Rm))))]> { bits<4> Rm; let Inst{15-12} = Rm{3-0}; @@ -487,10 +570,10 @@ class MVE_rDest<dag oops, dag iops, InstrItinClass itin, let Inst{4} = 0b0; } -class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]> +class MVE_VABAV<string suffix, bit U, bits<2> size> : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm), NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src", - pattern> { + []> { bits<4> Qm; bits<4> Qn; bits<4> Rda; @@ -509,12 +592,35 @@ class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let Inst{0} = 0b1; } -def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>; -def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>; -def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>; -def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>; -def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>; -def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>; +multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> { + def "" : MVE_VABAV<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + def : Pat<(i32 (int_arm_mve_vabav + (i32 VTI.Unsigned), + (i32 rGPR:$Rda_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (i32 (Inst (i32 rGPR:$Rda_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>; + + def : Pat<(i32 (int_arm_mve_vabav_predicated + (i32 VTI.Unsigned), + (i32 rGPR:$Rda_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (i32 (Inst (i32 rGPR:$Rda_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } +} + +defm MVE_VABAVs8 : MVE_VABAV_m<MVE_v16s8>; +defm MVE_VABAVs16 : MVE_VABAV_m<MVE_v8s16>; +defm MVE_VABAVs32 : MVE_VABAV_m<MVE_v4s32>; +defm MVE_VABAVu8 : MVE_VABAV_m<MVE_v16u8>; +defm MVE_VABAVu16 : MVE_VABAV_m<MVE_v8u16>; +defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>; class MVE_VADDV<string iname, string suffix, dag iops, string cstr, bit A, bit U, bits<2> size, list<dag> pattern=[]> @@ -658,17 +764,31 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size, let Inst{0} = 0b0; } -multiclass MVE_VMINMAXV_ty<string iname, bit 
bit_7, list<dag> pattern=[]> { - def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b1, bit_7>; - def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b1, bit_7>; - def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b1, bit_7>; - def u8 : MVE_VMINMAXV<iname, "u8", 0b1, 0b00, 0b1, bit_7>; - def u16 : MVE_VMINMAXV<iname, "u16", 0b1, 0b01, 0b1, bit_7>; - def u32 : MVE_VMINMAXV<iname, "u32", 0b1, 0b10, 0b1, bit_7>; +multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7, + MVEVectorVTInfo VTI, Intrinsic intr> { + def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, + bit_17, bit_7>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in + def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; +} + +multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, + Intrinsic intr_s, Intrinsic intr_u> { + defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>; + defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>; + defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>; + defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>; + defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>; + defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>; } -defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>; -defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>; +defm MVE_VMINV : MVE_VMINMAXV_ty< + "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>; +defm MVE_VMAXV : MVE_VMINMAXV_ty< + "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), @@ -709,10 +829,9 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, - bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0, - list<dag> pattern=[]> + bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0> : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix, - "$RdaDest, $Qn, $Qm", cstr, pattern> { + "$RdaDest, $Qn, $Qm", cstr, []> { bits<4> RdaDest; bits<3> Qm; bits<3> Qn; @@ -730,47 +849,88 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, let Inst{0} = bit_0; } -multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix, - bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, - list<dag> pattern=[]> { - def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix, +multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI, + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> { + def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; - def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix, + sz, bit_28, 0b0, X, bit_8, bit_0>; + def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix, (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm), "$RdaDest = $RdaSrc", - sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; + sz, bit_28, 0b1, X, bit_8, bit_0>; + let Predicates = [HasMVEInt] in { + def : Pat<(i32 (int_arm_mve_vmldava + (i32 VTI.Unsigned), + (i32 bit_0) /* subtract */, + (i32 X) /* exchange */, + (i32 0) /* accumulator */, + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (i32 (!cast<Instruction>(NAME # x # VTI.Suffix) + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>; + + def : Pat<(i32 (int_arm_mve_vmldava_predicated + (i32 VTI.Unsigned), + (i32 bit_0) /* subtract */, + (i32 X) /* exchange */, + (i32 0) /* accumulator */, + 
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (i32 (!cast<Instruction>(NAME # x # VTI.Suffix) + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + + def : Pat<(i32 (int_arm_mve_vmldava + (i32 VTI.Unsigned), + (i32 bit_0) /* subtract */, + (i32 X) /* exchange */, + (i32 tGPREven:$RdaSrc), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix) + (i32 tGPREven:$RdaSrc), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>; + + def : Pat<(i32 (int_arm_mve_vmldava_predicated + (i32 VTI.Unsigned), + (i32 bit_0) /* subtract */, + (i32 X) /* exchange */, + (i32 tGPREven:$RdaSrc), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix) + (i32 tGPREven:$RdaSrc), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } } -multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28, - 0b0, bit_8, bit_0, pattern>; - defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28, - 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLAMLSDAV_AX<string iname, MVEVectorVTInfo VTI, bit sz, + bit bit_28, bit bit_8, bit bit_0> { + defm "" : MVE_VMLAMLSDAV_A<iname, "", VTI, sz, bit_28, + 0b0, bit_8, bit_0>; + defm "" : MVE_VMLAMLSDAV_A<iname, "x", VTI, sz, bit_28, + 0b1, bit_8, bit_0>; } -multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix, - sz, 0b0, bit_8, 0b0, pattern>; - defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix, - sz, 0b1, 0b0, bit_8, 0b0, pattern>; +multiclass MVE_VMLADAV_multi<MVEVectorVTInfo SVTI, MVEVectorVTInfo UVTI, + bit sz, bit bit_8> { + defm "" : MVE_VMLAMLSDAV_AX<"vmladav", SVTI, + sz, 0b0, bit_8, 0b0>; + defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", UVTI, + sz, 0b1, 0b0, bit_8, 0b0>; } -multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix, - sz, bit_28, 0b0, 0b1, pattern>; +multiclass MVE_VMLSDAV_multi<MVEVectorVTInfo VTI, bit sz, bit bit_28> { + defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", VTI, + sz, bit_28, 0b0, 0b1>; } -defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>; -defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>; -defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v16s8, MVE_v16u8, 0b0, 0b1>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v8s16, MVE_v8u16, 0b0, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v4s32, MVE_v4u32, 0b1, 0b0>; -defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>; -defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>; -defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>; // vmlav aliases vmladav foreach acc = ["", "a"] in { @@ -932,6 +1092,16 @@ let Predicates = [HasMVEFloat] in { (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), 
(v4f32 MQPR:$val2), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; } def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>; @@ -942,6 +1112,16 @@ let Predicates = [HasMVEFloat] in { (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; } @@ -957,50 +1137,48 @@ class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size, let Inst{8} = 0b0; let Inst{6} = 0b1; let Inst{4} = bit_4; + let validForTailPredication = 1; } -multiclass MVE_VMINMAX_all_sizes<string iname, bit bit_4> { - def s8 : MVE_VMINMAX<iname, "s8", 0b0, 0b00, bit_4>; - def s16 : MVE_VMINMAX<iname, "s16", 0b0, 0b01, bit_4>; - def s32 : MVE_VMINMAX<iname, "s32", 0b0, 0b10, bit_4>; - def u8 : MVE_VMINMAX<iname, "u8", 0b1, 0b00, bit_4>; - def u16 : MVE_VMINMAX<iname, "u16", 0b1, 0b01, bit_4>; - def u32 : MVE_VMINMAX<iname, "u32", 0b1, 0b10, bit_4>; -} +multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VMINMAX<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_4>; + defvar Inst = !cast<Instruction>(NAME); -defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>; -defm MVE_VMIN : MVE_VMINMAX_all_sizes<"vmin", 0b1>; + let Predicates = [HasMVEInt] in { + // Unpredicated min/max + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - - def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - - def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 
MQPR:$val2)))>; - - def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + // Predicated min/max + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VMAX<MVEVectorVTInfo VTI> + : MVE_VMINMAX_m<"vmax", 0b0, VTI, !if(VTI.Unsigned, umax, smax), int_arm_mve_max_predicated>; +multiclass MVE_VMIN<MVEVectorVTInfo VTI> + : MVE_VMINMAX_m<"vmin", 0b1, VTI, !if(VTI.Unsigned, umin, smin), int_arm_mve_min_predicated>; + +defm MVE_VMINs8 : MVE_VMIN<MVE_v16s8>; +defm MVE_VMINs16 : MVE_VMIN<MVE_v8s16>; +defm MVE_VMINs32 : MVE_VMIN<MVE_v4s32>; +defm MVE_VMINu8 : MVE_VMIN<MVE_v16u8>; +defm MVE_VMINu16 : MVE_VMIN<MVE_v8u16>; +defm MVE_VMINu32 : MVE_VMIN<MVE_v4u32>; + +defm MVE_VMAXs8 : MVE_VMAX<MVE_v16s8>; +defm MVE_VMAXs16 : MVE_VMAX<MVE_v8s16>; +defm MVE_VMAXs32 : MVE_VMAX<MVE_v4s32>; +defm MVE_VMAXu8 : MVE_VMAX<MVE_v16u8>; +defm MVE_VMAXu16 : MVE_VMAX<MVE_v8u16>; +defm MVE_VMAXu32 : MVE_VMAX<MVE_v4u32>; + // end of mve_comp instructions // start of mve_bit instructions @@ -1150,53 +1328,61 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; } -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), - (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), - (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def 
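// A scalar sketch of what the ARMVCCThen / $inactive operands in the
// predicated patterns above encode; the helper name is illustrative only.
// Lanes whose predicate bit is set take the operation's result, the remaining
// lanes keep the corresponding lane of the $inactive vector.  4-lane signed
// min shown as the example operation.
#include <cstdint>
#include <cstdio>
#include <algorithm>

static void vmin_merge_s32(int32_t qd[4], const int32_t qm[4], const int32_t qn[4],
                           const bool mask[4], const int32_t inactive[4]) {
  for (int i = 0; i < 4; ++i)
    qd[i] = mask[i] ? std::min(qm[i], qn[i]) : inactive[i];
}

int main() {
  int32_t qd[4];
  const int32_t qm[4] = {1, 9, 3, 9}, qn[4] = {2, 2, 2, 2};
  const bool mask[4] = {true, false, true, false};
  const int32_t inactive[4] = {-1, -1, -1, -1};
  vmin_merge_s32(qd, qm, qn, mask, inactive);
  printf("%d %d %d %d\n", qd[0], qd[1], qd[2], qd[3]); // 1 -1 2 -1
  return 0;
}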
: Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), - (v4i32 (MVE_VBIC (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), - (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), - (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), - (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), - (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), - (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; +multiclass MVE_bit_op<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> { + let Predicates = [HasMVEInt] in { + // Unpredicated operation + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated operation + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (instruction + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +defm : MVE_bit_op<MVE_v16i8, and, int_arm_mve_and_predicated, MVE_VAND>; +defm : MVE_bit_op<MVE_v8i16, and, int_arm_mve_and_predicated, MVE_VAND>; +defm : MVE_bit_op<MVE_v4i32, and, int_arm_mve_and_predicated, MVE_VAND>; +defm : MVE_bit_op<MVE_v2i64, and, int_arm_mve_and_predicated, MVE_VAND>; + +defm : MVE_bit_op<MVE_v16i8, or, int_arm_mve_orr_predicated, MVE_VORR>; +defm : MVE_bit_op<MVE_v8i16, or, int_arm_mve_orr_predicated, MVE_VORR>; +defm : MVE_bit_op<MVE_v4i32, or, int_arm_mve_orr_predicated, MVE_VORR>; +defm : MVE_bit_op<MVE_v2i64, or, int_arm_mve_orr_predicated, MVE_VORR>; + +defm : MVE_bit_op<MVE_v16i8, xor, int_arm_mve_eor_predicated, MVE_VEOR>; +defm : MVE_bit_op<MVE_v8i16, xor, int_arm_mve_eor_predicated, MVE_VEOR>; +defm : MVE_bit_op<MVE_v4i32, xor, int_arm_mve_eor_predicated, MVE_VEOR>; +defm : MVE_bit_op<MVE_v2i64, xor, int_arm_mve_eor_predicated, MVE_VEOR>; + +multiclass MVE_bit_op_with_inv<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> { + let Predicates = [HasMVEInt] in { + // Unpredicated operation + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (vnotq (VTI.Vec MQPR:$Qn)))), + (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated operation + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (instruction + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +defm : MVE_bit_op_with_inv<MVE_v16i8, and, int_arm_mve_bic_predicated, MVE_VBIC>; +defm : MVE_bit_op_with_inv<MVE_v8i16, and, int_arm_mve_bic_predicated, MVE_VBIC>; +defm : MVE_bit_op_with_inv<MVE_v4i32, and, int_arm_mve_bic_predicated, MVE_VBIC>; +defm : MVE_bit_op_with_inv<MVE_v2i64, and, int_arm_mve_bic_predicated, MVE_VBIC>; + +defm : MVE_bit_op_with_inv<MVE_v16i8, or, int_arm_mve_orn_predicated, MVE_VORN>; +defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>; +defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>; +defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>; + class 
MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { @@ -1429,8 +1615,9 @@ class MVE_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]> let Inst{3-1} = Qm{2-0}; } -class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_int<"vmul", suffix, size, pattern> { +class MVE_VMULt1<string iname, string suffix, bits<2> size, + list<dag> pattern=[]> + : MVE_int<iname, suffix, size, pattern> { let Inst{28} = 0b0; let Inst{25-23} = 0b110; @@ -1438,22 +1625,36 @@ class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]> let Inst{12-8} = 0b01001; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } -def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>; -def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>; -def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>; +multiclass MVE_VMUL_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VMULt1<iname, VTI.Suffix, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + let Predicates = [HasMVEInt] in { + // Unpredicated multiply + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated multiply + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } -class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding, +multiclass MVE_VMUL<MVEVectorVTInfo VTI> + : MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>; + +defm MVE_VMULi8 : MVE_VMUL<MVE_v16i8>; +defm MVE_VMULi16 : MVE_VMUL<MVE_v8i16>; +defm MVE_VMULi32 : MVE_VMUL<MVE_v4i32>; + +class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding, list<dag> pattern=[]> : MVE_int<iname, suffix, size, pattern> { @@ -1465,18 +1666,40 @@ class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding, let Inst{0} = 0b0; } -class MVE_VQDMULH<string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_VQxDMULH<"vqdmulh", suffix, size, 0b0, pattern>; -class MVE_VQRDMULH<string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_VQxDMULH<"vqrdmulh", suffix, size, 0b1, pattern>; +multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, + bit rounding> { + def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VQDMULHi8 : MVE_VQDMULH<"s8", 0b00>; -def MVE_VQDMULHi16 : MVE_VQDMULH<"s16", 0b01>; -def MVE_VQDMULHi32 : MVE_VQDMULH<"s32", 0b10>; + let Predicates = [HasMVEInt] in { + // Unpredicated multiply + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated multiply + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec 
MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} -def MVE_VQRDMULHi8 : MVE_VQRDMULH<"s8", 0b00>; -def MVE_VQRDMULHi16 : MVE_VQRDMULH<"s16", 0b01>; -def MVE_VQRDMULHi32 : MVE_VQRDMULH<"s32", 0b10>; +multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, bit rounding> + : MVE_VQxDMULH_m<iname, VTI, !if(rounding, int_arm_mve_vqrdmulh, + int_arm_mve_vqdmulh), + !if(rounding, int_arm_mve_qrdmulh_predicated, + int_arm_mve_qdmulh_predicated), + rounding>; + +defm MVE_VQDMULHi8 : MVE_VQxDMULH<"vqdmulh", MVE_v16s8, 0b0>; +defm MVE_VQDMULHi16 : MVE_VQxDMULH<"vqdmulh", MVE_v8s16, 0b0>; +defm MVE_VQDMULHi32 : MVE_VQxDMULH<"vqdmulh", MVE_v4s32, 0b0>; + +defm MVE_VQRDMULHi8 : MVE_VQxDMULH<"vqrdmulh", MVE_v16s8, 0b1>; +defm MVE_VQRDMULHi16 : MVE_VQxDMULH<"vqrdmulh", MVE_v8s16, 0b1>; +defm MVE_VQRDMULHi32 : MVE_VQxDMULH<"vqrdmulh", MVE_v4s32, 0b1>; class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract, list<dag> pattern=[]> @@ -1491,39 +1714,40 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract, let validForTailPredication = 1; } -class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>; -class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>; +multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VADDi8 : MVE_VADD<"i8", 0b00>; -def MVE_VADDi16 : MVE_VADD<"i16", 0b01>; -def MVE_VADDi32 : MVE_VADD<"i32", 0b10>; + let Predicates = [HasMVEInt] in { + // Unpredicated add/subtract + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + // Predicated add/subtract + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } -def MVE_VSUBi8 : MVE_VSUB<"i8", 0b00>; -def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>; -def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>; +multiclass MVE_VADD<MVEVectorVTInfo VTI> + : MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; +multiclass MVE_VSUB<MVEVectorVTInfo VTI> + : MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; -} +defm MVE_VADDi8 : MVE_VADD<MVE_v16i8>; +defm MVE_VADDi16 : MVE_VADD<MVE_v8i16>; +defm 
MVE_VADDi32 : MVE_VADD<MVE_v4i32>; + +defm MVE_VSUBi8 : MVE_VSUB<MVE_v16i8>; +defm MVE_VSUBi16 : MVE_VSUB<MVE_v8i16>; +defm MVE_VSUBi32 : MVE_VSUB<MVE_v4i32>; class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, - bits<2> size, ValueType vt> + bits<2> size> : MVE_int<iname, suffix, size, []> { let Inst{28} = U; @@ -1535,50 +1759,75 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, let Inst{4} = 0b1; let Inst{0} = 0b0; let validForTailPredication = 1; +} - ValueType VT = vt; +class MVE_VQADD_<string suffix, bit U, bits<2> size> + : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size>; +class MVE_VQSUB_<string suffix, bit U, bits<2> size> + : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size>; + +multiclass MVE_VQADD_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating add + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated saturating add + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } -class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT> - : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>; -class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT> - : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>; +multiclass MVE_VQADD<MVEVectorVTInfo VTI, SDNode unpred_op> + : MVE_VQADD_m<VTI, unpred_op, int_arm_mve_qadd_predicated>; + +defm MVE_VQADDs8 : MVE_VQADD<MVE_v16s8, saddsat>; +defm MVE_VQADDs16 : MVE_VQADD<MVE_v8s16, saddsat>; +defm MVE_VQADDs32 : MVE_VQADD<MVE_v4s32, saddsat>; +defm MVE_VQADDu8 : MVE_VQADD<MVE_v16u8, uaddsat>; +defm MVE_VQADDu16 : MVE_VQADD<MVE_v8u16, uaddsat>; +defm MVE_VQADDu32 : MVE_VQADD<MVE_v4u32, uaddsat>; + +multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating subtract + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated saturating subtract + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} -def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>; -def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>; -def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>; -def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>; -def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>; -def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>; +multiclass MVE_VQSUB<MVEVectorVTInfo VTI, SDNode unpred_op> + : MVE_VQSUB_m<VTI, unpred_op, int_arm_mve_qsub_predicated>; -def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>; -def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>; -def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>; -def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>; -def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>; -def 
MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>; +defm MVE_VQSUBs8 : MVE_VQSUB<MVE_v16s8, ssubsat>; +defm MVE_VQSUBs16 : MVE_VQSUB<MVE_v8s16, ssubsat>; +defm MVE_VQSUBs32 : MVE_VQSUB<MVE_v4s32, ssubsat>; +defm MVE_VQSUBu8 : MVE_VQSUB<MVE_v16u8, usubsat>; +defm MVE_VQSUBu16 : MVE_VQSUB<MVE_v8u16, usubsat>; +defm MVE_VQSUBu32 : MVE_VQSUB<MVE_v4u32, usubsat>; -let Predicates = [HasMVEInt] in { - foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in - foreach VT = [instr.VT] in - def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), - (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; - foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in - foreach VT = [instr.VT] in - def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), - (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; - foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in - foreach VT = [instr.VT] in - def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), - (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; - foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in - foreach VT = [instr.VT] in - def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), - (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; -} - - -class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> +class MVE_VABD_int<string suffix, bit U, bits<2> size, + list<dag> pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { let Inst{28} = U; @@ -1590,14 +1839,38 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let validForTailPredication = 1; } -def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; -def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>; -def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>; -def MVE_VABDu8 : MVE_VABD_int<"u8", 0b1, 0b00>; -def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>; -def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>; +multiclass MVE_VABD_m<MVEVectorVTInfo VTI, + Intrinsic unpred_int, Intrinsic pred_int> { + def "" : MVE_VABD_int<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated absolute difference + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated absolute difference + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VABD<MVEVectorVTInfo VTI> + : MVE_VABD_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>; -class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> +defm MVE_VABDs8 : MVE_VABD<MVE_v16s8>; +defm MVE_VABDs16 : MVE_VABD<MVE_v8s16>; +defm MVE_VABDs32 : MVE_VABD<MVE_v4s32>; +defm MVE_VABDu8 : MVE_VABD<MVE_v16u8>; +defm MVE_VABDu16 : MVE_VABD<MVE_v8u16>; +defm MVE_VABDu32 : MVE_VABD<MVE_v4u32>; + +class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]> : MVE_int<"vrhadd", suffix, size, pattern> { let Inst{28} = U; @@ -1609,12 +1882,36 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let validForTailPredication = 1; } -def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>; -def MVE_VRHADDs16 : MVE_VRHADD<"s16", 0b0, 0b01>; -def MVE_VRHADDs32 : MVE_VRHADD<"s32", 0b0, 0b10>; -def MVE_VRHADDu8 : MVE_VRHADD<"u8", 0b1, 0b00>; -def MVE_VRHADDu16 : MVE_VRHADD<"u16", 0b1, 0b01>; -def 
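// A scalar sketch of the saturation behind the saddsat/uaddsat/ssubsat/usubsat
// patterns above: the result is clamped to the element type's range instead of
// wrapping.  s8 addition shown; helper name and values are illustrative only.
#include <cstdint>
#include <cstdio>

static int8_t qadd_s8(int8_t a, int8_t b) {
  int32_t s = (int32_t)a + (int32_t)b;   // widen, then clamp to [-128, 127]
  if (s > INT8_MAX) s = INT8_MAX;
  if (s < INT8_MIN) s = INT8_MIN;
  return (int8_t)s;
}

int main() {
  printf("%d %d\n", qadd_s8(100, 100), qadd_s8(-100, -100)); // 127 -128
  return 0;
}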
MVE_VRHADDu32 : MVE_VRHADD<"u32", 0b1, 0b10>; +multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated rounding add-with-divide-by-two + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated add-with-divide-by-two + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VRHADD<MVEVectorVTInfo VTI> + : MVE_VRHADD_m<VTI, int_arm_mve_vrhadd, int_arm_mve_rhadd_predicated>; + +defm MVE_VRHADDs8 : MVE_VRHADD<MVE_v16s8>; +defm MVE_VRHADDs16 : MVE_VRHADD<MVE_v8s16>; +defm MVE_VRHADDs32 : MVE_VRHADD<MVE_v4s32>; +defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>; +defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>; +defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>; class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, bits<2> size, list<dag> pattern=[]> @@ -1631,81 +1928,73 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, let validForTailPredication = 1; } -class MVE_VHADD<string suffix, bit U, bits<2> size, +class MVE_VHADD_<string suffix, bit U, bits<2> size, list<dag> pattern=[]> : MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>; -class MVE_VHSUB<string suffix, bit U, bits<2> size, +class MVE_VHSUB_<string suffix, bit U, bits<2> size, list<dag> pattern=[]> : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; -def MVE_VHADDs8 : MVE_VHADD<"s8", 0b0, 0b00>; -def MVE_VHADDs16 : MVE_VHADD<"s16", 0b0, 0b01>; -def MVE_VHADDs32 : MVE_VHADD<"s32", 0b0, 0b10>; -def MVE_VHADDu8 : MVE_VHADD<"u8", 0b1, 0b00>; -def MVE_VHADDu16 : MVE_VHADD<"u16", 0b1, 0b01>; -def MVE_VHADDu32 : MVE_VHADD<"u32", 0b1, 0b10>; - -def MVE_VHSUBs8 : MVE_VHSUB<"s8", 0b0, 0b00>; -def MVE_VHSUBs16 : MVE_VHSUB<"s16", 0b0, 0b01>; -def MVE_VHSUBs32 : MVE_VHSUB<"s32", 0b0, 0b10>; -def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; -def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; -def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; +multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated add-and-divide-by-two + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated add-and-divide-by-two + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (ARMvshrsImm - (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHADDs8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshrsImm - (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHADDs16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshrsImm - (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 
(MVE_VHADDs32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshruImm - (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHADDu8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshruImm - (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHADDu16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshruImm - (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHADDu32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshrsImm - (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHSUBs8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshrsImm - (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHSUBs16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshrsImm - (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHSUBs32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshruImm - (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHSUBu8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshruImm - (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHSUBu16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshruImm - (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHSUBu32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; +multiclass MVE_VHADD<MVEVectorVTInfo VTI> + : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>; + +defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>; +defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>; +defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>; +defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>; +defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>; +defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>; + +multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated subtract-and-divide-by-two + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated subtract-and-divide-by-two + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VHSUB<MVEVectorVTInfo VTI> + : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>; + +defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>; +defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>; +defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>; +defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>; +defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>; +defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>; + class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { @@ -1873,6 +2162,49 @@ def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>; def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>; def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>; +// int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times +// zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert +multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max, + dag zero_vec, 
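// A scalar sketch of the "add-and-divide-by-two" wording in the comments
// above: vhadd/vhsub form the sum or difference at double width and shift it
// right by one, so no lane can overflow, and vrhadd adds 1 before the shift to
// round.  u8 shown; helper names and values are illustrative only.
#include <cstdint>
#include <cstdio>

static uint8_t vhadd_u8(uint8_t a, uint8_t b)  { return (uint8_t)(((int)a + b) >> 1); }
static uint8_t vrhadd_u8(uint8_t a, uint8_t b) { return (uint8_t)(((int)a + b + 1) >> 1); }

int main() {
  printf("%d %d\n", vhadd_u8(250, 5), vrhadd_u8(250, 5)); // 127 128
  return 0;
}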
MVE_VQABSNEG vqabs_instruction, + MVE_VQABSNEG vqneg_instruction> { + let Predicates = [HasMVEInt] in { + // The below tree can be replaced by a vqabs instruction, as it represents + // the following vectorized expression (r being the value in $reg): + // r > 0 ? r : (r == INT_MIN ? INT_MAX : -r) + def : Pat<(VTI.Vec (vselect + (VTI.Pred (ARMvcmpz (VTI.Vec MQPR:$reg), ARMCCgt)), + (VTI.Vec MQPR:$reg), + (VTI.Vec (vselect + (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)), + int_max, + (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))))), + (VTI.Vec (vqabs_instruction (VTI.Vec MQPR:$reg)))>; + // Similarly, this tree represents vqneg, i.e. the following vectorized expression: + // r == INT_MIN ? INT_MAX : -r + def : Pat<(VTI.Vec (vselect + (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)), + int_max, + (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))), + (VTI.Vec (vqneg_instruction (VTI.Vec MQPR:$reg)))>; + } +} + +defm MVE_VQABSNEG_Ps8 : vqabsneg_pattern<MVE_v16i8, + (v16i8 (ARMvmovImm (i32 3712))), + (v16i8 (ARMvmovImm (i32 3711))), + (bitconvert (v4i32 (ARMvmovImm (i32 0)))), + MVE_VQABSs8, MVE_VQNEGs8>; +defm MVE_VQABSNEG_Ps16 : vqabsneg_pattern<MVE_v8i16, + (v8i16 (ARMvmovImm (i32 2688))), + (v8i16 (ARMvmvnImm (i32 2688))), + (bitconvert (v4i32 (ARMvmovImm (i32 0)))), + MVE_VQABSs16, MVE_VQNEGs16>; +defm MVE_VQABSNEG_Ps32 : vqabsneg_pattern<MVE_v4i32, + (v4i32 (ARMvmovImm (i32 1664))), + (v4i32 (ARMvmvnImm (i32 1664))), + (ARMvmovImm (i32 0)), + MVE_VQABSs32, MVE_VQNEGs32>; + class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op, dag iops, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm", @@ -1956,6 +2288,7 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let validForTailPredication = 1; } def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>; @@ -2049,8 +2382,8 @@ let Predicates = [HasMVEInt] in { class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th, - dag immops, list<dag> pattern=[]> - : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops), + Operand immtype, list<dag> pattern=[]> + : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm), iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> { let Inst{28} = U; let Inst{25-23} = 0b101; @@ -2059,6 +2392,9 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th, let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = 0b0; + + // For the MVE_VSHLL_patterns multiclass to refer to + Operand immediateType = immtype; } // The immediate VSHLL instructions accept shift counts from 1 up to @@ -2067,7 +2403,7 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th, class MVE_VSHLL_imm8<string iname, string suffix, bit U, bit th, list<dag> pattern=[]> - : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_7:$imm), pattern> { + : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, pattern> { bits<3> imm; let Inst{20-19} = 0b01; let Inst{18-16} = imm; @@ -2075,7 +2411,7 @@ class MVE_VSHLL_imm8<string iname, string suffix, class MVE_VSHLL_imm16<string iname, string suffix, bit U, bit th, list<dag> pattern=[]> - : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_15:$imm), pattern> { + : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, pattern> { bits<4> imm; let Inst{20} = 0b1; let Inst{19-16} = imm; @@ -2119,11 +2455,50 @@ defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">; defm MVE_VSHLL_lwu8 : 
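// A scalar sketch of the expression the vqabsneg_pattern comment above spells
// out, r > 0 ? r : (r == INT_MIN ? INT_MAX : -r): a saturating abs/neg that
// avoids the INT_MIN negation overflow.  s32 lane shown; helper names are
// illustrative only.
#include <cstdint>
#include <cstdio>

static int32_t vqabs_s32(int32_t r) {
  return r > 0 ? r : (r == INT32_MIN ? INT32_MAX : -r);
}
static int32_t vqneg_s32(int32_t r) {
  return r == INT32_MIN ? INT32_MAX : -r;
}

int main() {
  printf("%d %d\n", vqabs_s32(INT32_MIN), vqneg_s32(-7)); // 2147483647 7
  return 0;
}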
MVE_VSHLL_lw<"vshll", "u8", 0b00, 0b1, "$Qd, $Qm, #8">; defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">; +multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> { + defvar suffix = !strconcat(VTI.Suffix, !if(top, "th", "bh")); + defvar inst_imm = !cast<MVE_VSHLL_imm>("MVE_VSHLL_imm" # suffix); + defvar inst_lw = !cast<MVE_VSHLL_by_lane_width>("MVE_VSHLL_lw" # suffix); + defvar unpred_int = int_arm_mve_vshll_imm; + defvar pred_int = int_arm_mve_vshll_imm_predicated; + defvar imm = inst_imm.immediateType; + + def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), imm:$imm, + (i32 VTI.Unsigned), (i32 top))), + (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm))>; + def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits), + (i32 VTI.Unsigned), (i32 top))), + (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src)))>; + + def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm, + (i32 VTI.Unsigned), (i32 top), + (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits), + (i32 VTI.Unsigned), (i32 top), + (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen, + (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; +} + +foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in + foreach top = [0, 1] in + defm : MVE_VSHLL_patterns<VTI, top>; + +class MVE_shift_imm_partial<Operand imm, string iname, string suffix> + : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$QdSrc, MQPR:$Qm, imm:$imm), + iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc"> { + Operand immediateType = imm; +} + class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28, - dag immops, list<dag> pattern=[]> - : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), - iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", - pattern> { + Operand imm, list<dag> pattern=[]> + : MVE_shift_imm_partial<imm, iname, suffix> { bits<5> imm; let Inst{28} = bit_28; @@ -2136,45 +2511,35 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28, let Inst{0} = 0b1; } -def MVE_VRSHRNi16bh : MVE_VxSHRN< - "vrshrnb", "i16", 0b0, 0b1, (ins shr_imm8:$imm)> { +def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> { let Inst{20-19} = 0b01; } -def MVE_VRSHRNi16th : MVE_VxSHRN< - "vrshrnt", "i16", 0b1, 0b1,(ins shr_imm8:$imm)> { +def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> { let Inst{20-19} = 0b01; } -def MVE_VRSHRNi32bh : MVE_VxSHRN< - "vrshrnb", "i32", 0b0, 0b1, (ins shr_imm16:$imm)> { +def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16> { let Inst{20} = 0b1; } -def MVE_VRSHRNi32th : MVE_VxSHRN< - "vrshrnt", "i32", 0b1, 0b1, (ins shr_imm16:$imm)> { +def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16> { let Inst{20} = 0b1; } -def MVE_VSHRNi16bh : MVE_VxSHRN< - "vshrnb", "i16", 0b0, 0b0, (ins shr_imm8:$imm)> { +def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8> { let Inst{20-19} = 0b01; } -def MVE_VSHRNi16th : MVE_VxSHRN< - "vshrnt", "i16", 0b1, 0b0, (ins shr_imm8:$imm)> { +def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8> { let Inst{20-19} = 0b01; } -def MVE_VSHRNi32bh : MVE_VxSHRN< - "vshrnb", "i32", 0b0, 0b0, (ins shr_imm16:$imm)> { +def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 
0b0, 0b0, shr_imm16> { let Inst{20} = 0b1; } -def MVE_VSHRNi32th : MVE_VxSHRN< - "vshrnt", "i32", 0b1, 0b0, (ins shr_imm16:$imm)> { +def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16> { let Inst{20} = 0b1; } -class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, dag immops, - list<dag> pattern=[]> - : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), - iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", - pattern> { +class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, + Operand imm, list<dag> pattern=[]> + : MVE_shift_imm_partial<imm, iname, suffix> { bits<5> imm; let Inst{28} = bit_28; @@ -2188,44 +2553,42 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, dag imm } def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN< - "vqrshrunb", "s16", 0b1, 0b0, (ins shr_imm8:$imm)> { + "vqrshrunb", "s16", 0b1, 0b0, shr_imm8> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN< - "vqrshrunt", "s16", 0b1, 0b1, (ins shr_imm8:$imm)> { + "vqrshrunt", "s16", 0b1, 0b1, shr_imm8> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN< - "vqrshrunb", "s32", 0b1, 0b0, (ins shr_imm16:$imm)> { + "vqrshrunb", "s32", 0b1, 0b0, shr_imm16> { let Inst{20} = 0b1; } def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN< - "vqrshrunt", "s32", 0b1, 0b1, (ins shr_imm16:$imm)> { + "vqrshrunt", "s32", 0b1, 0b1, shr_imm16> { let Inst{20} = 0b1; } def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN< - "vqshrunb", "s16", 0b0, 0b0, (ins shr_imm8:$imm)> { + "vqshrunb", "s16", 0b0, 0b0, shr_imm8> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs16th : MVE_VxQRSHRUN< - "vqshrunt", "s16", 0b0, 0b1, (ins shr_imm8:$imm)> { + "vqshrunt", "s16", 0b0, 0b1, shr_imm8> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN< - "vqshrunb", "s32", 0b0, 0b0, (ins shr_imm16:$imm)> { + "vqshrunb", "s32", 0b0, 0b0, shr_imm16> { let Inst{20} = 0b1; } def MVE_VQSHRUNs32th : MVE_VxQRSHRUN< - "vqshrunt", "s32", 0b0, 0b1, (ins shr_imm16:$imm)> { + "vqshrunt", "s32", 0b0, 0b1, shr_imm16> { let Inst{20} = 0b1; } class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12, - dag immops, list<dag> pattern=[]> - : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), - iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", - pattern> { + Operand imm, list<dag> pattern=[]> + : MVE_shift_imm_partial<imm, iname, suffix> { bits<5> imm; let Inst{25-23} = 0b101; @@ -2238,19 +2601,19 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12, } multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> { - def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, (ins shr_imm8:$imm)> { + def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> { let Inst{28} = 0b0; let Inst{20-19} = 0b01; } - def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, (ins shr_imm8:$imm)> { + def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> { let Inst{28} = 0b1; let Inst{20-19} = 0b01; } - def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, (ins shr_imm16:$imm)> { + def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, shr_imm16> { let Inst{28} = 0b0; let Inst{20} = 0b1; } - def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, (ins shr_imm16:$imm)> { + def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, shr_imm16> { let Inst{28} = 0b1; let Inst{20} = 0b1; } @@ -2261,6 +2624,63 @@ defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>; defm MVE_VQSHRNbh : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>; defm MVE_VQSHRNth : 
MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>; +multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst, + MVEVectorVTInfo OutVTI, MVEVectorVTInfo InVTI, + bit q, bit r, bit top> { + defvar inparams = (? (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm), + (inst.immediateType:$imm), (i32 q), (i32 r), + (i32 OutVTI.Unsigned), (i32 InVTI.Unsigned), (i32 top)); + defvar outparams = (inst (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm), + (imm:$imm)); + + def : Pat<(OutVTI.Vec !setop(inparams, int_arm_mve_vshrn)), + (OutVTI.Vec outparams)>; + def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated + (InVTI.Pred VCCR:$pred)))), + (OutVTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>; +} + +defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,0,0>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16s8, MVE_v8s16, 0,0,1>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,0,0>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8s16, MVE_v4s32, 0,0,1>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,0,0>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16u8, MVE_v8u16, 0,0,1>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,0,0>; +defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8u16, MVE_v4u32, 0,0,1>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,1,0>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16s8, MVE_v8s16, 0,1,1>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,1,0>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8s16, MVE_v4s32, 0,1,1>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,1,0>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16u8, MVE_v8u16, 0,1,1>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,1,0>; +defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8u16, MVE_v4u32, 0,1,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNths16, MVE_v16s8, MVE_v8s16, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNths32, MVE_v8s16, MVE_v4s32, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNthu16, MVE_v16u8, MVE_v8u16, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRNthu32, MVE_v8u16, MVE_v4u32, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNths16, MVE_v16s8, MVE_v8s16, 1,1,1>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNths32, MVE_v8s16, MVE_v4s32, 1,1,1>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu16, MVE_v16u8, MVE_v8u16, 1,1,1>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu32, MVE_v8u16, MVE_v4u32, 1,1,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,0,0>; +defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,0,1>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,1,1>; +defm : 
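// A scalar sketch of the flags the MVE_VSHRN_patterns instantiations above
// pass: a narrowing shift right halves the lane width and writes only the
// bottom (b) or top (t) half-lanes of the destination, which is why $QdSrc is
// an input; the r flag adds a rounding bias of 1 << (sh - 1) before the shift,
// and q selects the saturating vqshrn/vqrshrun forms (not modelled here).
// Helper name and values are illustrative only.
#include <cstdint>
#include <cstdio>

static void vshrn_i16_to_i8(uint8_t qd[16], const uint16_t qm[8], int sh,
                            bool round, bool top) {
  for (int i = 0; i < 8; ++i) {
    unsigned v = ((unsigned)qm[i] + (round ? (1u << (sh - 1)) : 0u)) >> sh;
    qd[2 * i + (top ? 1 : 0)] = (uint8_t)v;   // the other half-lanes keep $QdSrc
  }
}

int main() {
  uint8_t qd[16] = {0};
  const uint16_t qm[8] = {0x1234, 0x00ff, 0, 0, 0, 0, 0, 0};
  vshrn_i16_to_i8(qd, qm, 4, /*round=*/true, /*top=*/false);
  printf("%#x %#x\n", qd[0], qd[2]); // 0x23 0x10
  return 0;
}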
MVE_VSHRN_patterns<MVE_VQRSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,1,0>; +defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,1,1>; + // end of mve_imm_shift instructions // start of mve_shift instructions @@ -2293,13 +2713,31 @@ class MVE_shift_by_vec<string iname, string suffix, bit U, let validForTailPredication = 1; } +multiclass MVE_shift_by_vec_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> { + def "" : MVE_shift_by_vec<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>; + defvar Inst = !cast<Instruction>(NAME); + + def : Pat<(VTI.Vec (int_arm_mve_vshl_vector + (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh), + (i32 q), (i32 r), (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh)))>; + + def : Pat<(VTI.Vec (int_arm_mve_vshl_vector_predicated + (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh), + (i32 q), (i32 r), (i32 VTI.Unsigned), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> { - def s8 : MVE_shift_by_vec<iname, "s8", 0b0, 0b00, bit_4, bit_8>; - def s16 : MVE_shift_by_vec<iname, "s16", 0b0, 0b01, bit_4, bit_8>; - def s32 : MVE_shift_by_vec<iname, "s32", 0b0, 0b10, bit_4, bit_8>; - def u8 : MVE_shift_by_vec<iname, "u8", 0b1, 0b00, bit_4, bit_8>; - def u16 : MVE_shift_by_vec<iname, "u16", 0b1, 0b01, bit_4, bit_8>; - def u32 : MVE_shift_by_vec<iname, "u32", 0b1, 0b10, bit_4, bit_8>; + defm s8 : MVE_shift_by_vec_p<iname, MVE_v16s8, bit_4, bit_8>; + defm s16 : MVE_shift_by_vec_p<iname, MVE_v8s16, bit_4, bit_8>; + defm s32 : MVE_shift_by_vec_p<iname, MVE_v4s32, bit_4, bit_8>; + defm u8 : MVE_shift_by_vec_p<iname, MVE_v16u8, bit_4, bit_8>; + defm u16 : MVE_shift_by_vec_p<iname, MVE_v8u16, bit_4, bit_8>; + defm u32 : MVE_shift_by_vec_p<iname, MVE_v4u32, bit_4, bit_8>; } defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>; @@ -2340,11 +2778,18 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops, let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; let validForTailPredication = 1; + + // For the MVE_shift_imm_patterns multiclass to refer to + MVEVectorVTInfo VTI; + Operand immediateType; + Intrinsic unpred_int; + Intrinsic pred_int; + dag unsignedFlag = (?); } -class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> +class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType> : MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd), - !con((ins MQPR:$Qd_src, MQPR:$Qm), imm), + (ins MQPR:$Qd_src, MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> { bits<6> imm; let Inst{28} = 0b1; @@ -2353,76 +2798,99 @@ class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> let Inst{10-9} = 0b10; let Inst{8} = bit_8; let validForTailPredication = 1; + + Operand immediateType = immType; } -def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> { +def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8> { let Inst{21-19} = 0b001; } -def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, (ins shr_imm16:$imm)> { +def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16> { let Inst{21-20} = 0b01; } -def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, (ins shr_imm32:$imm)> { +def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32> { let Inst{21} = 0b1; } -def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, (ins imm0_7:$imm)> { +def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7> { 
let Inst{21-19} = 0b001; } -def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, (ins imm0_15:$imm)> { +def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15> { let Inst{21-20} = 0b01; } -def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,(ins imm0_31:$imm)> { +def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31> { let Inst{21} = 0b1; } -class MVE_VQSHL_imm<string suffix, dag imm> - : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd), - !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", +multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name, + MVEVectorVTInfo VTI> { + defvar inparams = (? (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm), + (inst.immediateType:$imm)); + defvar outparams = (inst (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm), + (inst.immediateType:$imm)); + defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # name); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # name # "_predicated"); + + def : Pat<(VTI.Vec !setop(inparams, unpred_int)), + (VTI.Vec outparams)>; + def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))), + (VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>; +} + +defm : MVE_VSxI_patterns<MVE_VSLIimm8, "vsli", MVE_v16i8>; +defm : MVE_VSxI_patterns<MVE_VSLIimm16, "vsli", MVE_v8i16>; +defm : MVE_VSxI_patterns<MVE_VSLIimm32, "vsli", MVE_v4i32>; +defm : MVE_VSxI_patterns<MVE_VSRIimm8, "vsri", MVE_v16i8>; +defm : MVE_VSxI_patterns<MVE_VSRIimm16, "vsri", MVE_v8i16>; +defm : MVE_VSxI_patterns<MVE_VSRIimm32, "vsri", MVE_v4i32>; + +class MVE_VQSHL_imm<MVEVectorVTInfo VTI_, Operand immType> + : MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd), + (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; + let Inst{28} = VTI_.Unsigned; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b111; -} - -def MVE_VSLIimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> { - let Inst{28} = 0b0; - let Inst{21-19} = 0b001; -} - -def MVE_VSLIimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> { - let Inst{28} = 0b1; - let Inst{21-19} = 0b001; -} -def MVE_VSLIimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> { - let Inst{28} = 0b0; - let Inst{21-20} = 0b01; + let VTI = VTI_; + let immediateType = immType; + let unsignedFlag = (? 
(i32 VTI.Unsigned)); } -def MVE_VSLIimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> { - let Inst{28} = 0b1; - let Inst{21-20} = 0b01; -} - -def MVE_VSLIimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> { - let Inst{28} = 0b0; - let Inst{21} = 0b1; -} - -def MVE_VSLIimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> { - let Inst{28} = 0b1; - let Inst{21} = 0b1; +let unpred_int = int_arm_mve_vqshl_imm, + pred_int = int_arm_mve_vqshl_imm_predicated in { + def MVE_VQSHLimms8 : MVE_VQSHL_imm<MVE_v16s8, imm0_7> { + let Inst{21-19} = 0b001; + } + def MVE_VQSHLimmu8 : MVE_VQSHL_imm<MVE_v16u8, imm0_7> { + let Inst{21-19} = 0b001; + } + + def MVE_VQSHLimms16 : MVE_VQSHL_imm<MVE_v8s16, imm0_15> { + let Inst{21-20} = 0b01; + } + def MVE_VQSHLimmu16 : MVE_VQSHL_imm<MVE_v8u16, imm0_15> { + let Inst{21-20} = 0b01; + } + + def MVE_VQSHLimms32 : MVE_VQSHL_imm<MVE_v4s32, imm0_31> { + let Inst{21} = 0b1; + } + def MVE_VQSHLimmu32 : MVE_VQSHL_imm<MVE_v4u32, imm0_31> { + let Inst{21} = 0b1; + } } -class MVE_VQSHLU_imm<string suffix, dag imm> - : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd), - !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", +class MVE_VQSHLU_imm<MVEVectorVTInfo VTI_, Operand immType> + : MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd), + (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; @@ -2430,61 +2898,103 @@ class MVE_VQSHLU_imm<string suffix, dag imm> let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b110; -} -def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> { - let Inst{21-19} = 0b001; + let VTI = VTI_; + let immediateType = immType; } -def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> { - let Inst{21-20} = 0b01; -} +let unpred_int = int_arm_mve_vqshlu_imm, + pred_int = int_arm_mve_vqshlu_imm_predicated in { + def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<MVE_v16s8, imm0_7> { + let Inst{21-19} = 0b001; + } -def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> { - let Inst{21} = 0b1; + def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<MVE_v8s16, imm0_15> { + let Inst{21-20} = 0b01; + } + + def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<MVE_v4s32, imm0_31> { + let Inst{21} = 0b1; + } } -class MVE_VRSHR_imm<string suffix, dag imm> - : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd), - !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", +class MVE_VRSHR_imm<MVEVectorVTInfo VTI_, Operand immType> + : MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd), + (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; + let Inst{28} = VTI_.Unsigned; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b010; -} -def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> { - let Inst{28} = 0b0; - let Inst{21-19} = 0b001; + let VTI = VTI_; + let immediateType = immType; + let unsignedFlag = (? 
(i32 VTI.Unsigned)); } -def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> { - let Inst{28} = 0b1; - let Inst{21-19} = 0b001; -} +let unpred_int = int_arm_mve_vrshr_imm, + pred_int = int_arm_mve_vrshr_imm_predicated in { + def MVE_VRSHR_imms8 : MVE_VRSHR_imm<MVE_v16s8, shr_imm8> { + let Inst{21-19} = 0b001; + } -def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> { - let Inst{28} = 0b0; - let Inst{21-20} = 0b01; -} + def MVE_VRSHR_immu8 : MVE_VRSHR_imm<MVE_v16u8, shr_imm8> { + let Inst{21-19} = 0b001; + } -def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> { - let Inst{28} = 0b1; - let Inst{21-20} = 0b01; -} + def MVE_VRSHR_imms16 : MVE_VRSHR_imm<MVE_v8s16, shr_imm16> { + let Inst{21-20} = 0b01; + } -def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> { - let Inst{28} = 0b0; - let Inst{21} = 0b1; -} + def MVE_VRSHR_immu16 : MVE_VRSHR_imm<MVE_v8u16, shr_imm16> { + let Inst{21-20} = 0b01; + } -def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> { - let Inst{28} = 0b1; - let Inst{21} = 0b1; + def MVE_VRSHR_imms32 : MVE_VRSHR_imm<MVE_v4s32, shr_imm32> { + let Inst{21} = 0b1; + } + + def MVE_VRSHR_immu32 : MVE_VRSHR_imm<MVE_v4u32, shr_imm32> { + let Inst{21} = 0b1; + } } +multiclass MVE_shift_imm_patterns<MVE_shift_with_imm inst> { + def : Pat<(inst.VTI.Vec !con((inst.unpred_int (inst.VTI.Vec MQPR:$src), + inst.immediateType:$imm), + inst.unsignedFlag)), + (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src), + inst.immediateType:$imm))>; + + def : Pat<(inst.VTI.Vec !con((inst.pred_int (inst.VTI.Vec MQPR:$src), + inst.immediateType:$imm), + inst.unsignedFlag, + (? (inst.VTI.Pred VCCR:$mask), + (inst.VTI.Vec MQPR:$inactive)))), + (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src), + inst.immediateType:$imm, + ARMVCCThen, (inst.VTI.Pred VCCR:$mask), + (inst.VTI.Vec MQPR:$inactive)))>; +} + +defm : MVE_shift_imm_patterns<MVE_VQSHLimms8>; +defm : MVE_shift_imm_patterns<MVE_VQSHLimmu8>; +defm : MVE_shift_imm_patterns<MVE_VQSHLimms16>; +defm : MVE_shift_imm_patterns<MVE_VQSHLimmu16>; +defm : MVE_shift_imm_patterns<MVE_VQSHLimms32>; +defm : MVE_shift_imm_patterns<MVE_VQSHLimmu32>; +defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms8>; +defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms16>; +defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms32>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_imms8>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_immu8>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_imms16>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_immu16>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_imms32>; +defm : MVE_shift_imm_patterns<MVE_VRSHR_immu32>; + class MVE_VSHR_imm<string suffix, dag imm> : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", @@ -2550,27 +3060,39 @@ def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> { let Inst{21} = 0b1; } +multiclass MVE_immediate_shift_patterns_inner< + MVEVectorVTInfo VTI, Operand imm_operand_type, SDNode unpred_op, + Intrinsic pred_int, Instruction inst, list<int> unsignedFlag = []> { + + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src), imm_operand_type:$imm)), + (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm))>; + + def : Pat<(VTI.Vec !con((pred_int (VTI.Vec MQPR:$src), imm_operand_type:$imm), + !dag(pred_int, unsignedFlag, ?), + (pred_int (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + +multiclass 
MVE_immediate_shift_patterns<MVEVectorVTInfo VTI, + Operand imm_operand_type> { + defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type, + ARMvshlImm, int_arm_mve_shl_imm_predicated, + !cast<Instruction>("MVE_VSHL_immi" # VTI.BitsSuffix)>; + defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type, + ARMvshruImm, int_arm_mve_shr_imm_predicated, + !cast<Instruction>("MVE_VSHR_immu" # VTI.BitsSuffix), [1]>; + defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type, + ARMvshrsImm, int_arm_mve_shr_imm_predicated, + !cast<Instruction>("MVE_VSHR_imms" # VTI.BitsSuffix), [0]>; +} + let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)), - (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>; - def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)), - (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>; - def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)), - (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>; - - def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)), - (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>; - def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)), - (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>; - def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)), - (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>; - - def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)), - (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>; - def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)), - (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>; - def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)), - (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>; + defm : MVE_immediate_shift_patterns<MVE_v16i8, imm0_7>; + defm : MVE_immediate_shift_patterns<MVE_v8i16, imm0_15>; + defm : MVE_immediate_shift_patterns<MVE_v4i32, imm0_31>; } // end of mve_shift instructions @@ -2652,8 +3174,8 @@ class MVEFloatArithNeon<string iname, string suffix, bit size, let Inst{16} = 0b0; } -class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]> - : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd), +class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]> + : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "", pattern> { bits<4> Qd; @@ -2671,20 +3193,32 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]> let validForTailPredication = 1; } -def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; -def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>; +multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>; + defvar Inst = !cast<Instruction>(NAME); -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), - (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; - def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), - (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, 
(VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } -class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]> +multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI> + : MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>; + +defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>; +defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>; + +class MVE_VCMLA<string suffix, bit size> : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", []> { bits<4> Qd; bits<4> Qn; bits<2> rot; @@ -2701,8 +3235,31 @@ class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]> let Inst{4} = 0b0; } -def MVE_VCMLAf16 : MVE_VCMLA<"f16", 0b0>; -def MVE_VCMLAf32 : MVE_VCMLA<"f32", 0b1>; +multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, bit size> { + def "" : MVE_VCMLA<VTI.Suffix, size>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (int_arm_mve_vcmlaq + imm:$rot, (VTI.Vec MQPR:$Qd_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot))>; + + def : Pat<(VTI.Vec (int_arm_mve_vcmlaq_predicated + imm:$rot, (VTI.Vec MQPR:$Qd_src), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qn), + (VTI.Vec MQPR:$Qm), imm:$rot, + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + + } +} + +defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16, 0b0>; +defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32, 0b1>; class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4, bit bit_8, bit bit_21, dag iops=(ins), @@ -2736,63 +3293,50 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -let Predicates = [HasMVEFloat, UseFusedMAC] in { - def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1), - (fmul (v8f16 MQPR:$src2), - (v8f16 MQPR:$src3)))), - (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>; - def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1), - (fmul (v4f32 MQPR:$src2), - (v4f32 MQPR:$src3)))), - (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>; - - def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1), - (fmul (v8f16 MQPR:$src2), - (v8f16 MQPR:$src3)))), - (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>; - def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1), - (fmul (v4f32 MQPR:$src2), - (v4f32 MQPR:$src3)))), - (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>; -} - let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; + def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), + (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>; + def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), + (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>; } - -let validForTailPredication = 1 in { - def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; - def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; -} - -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), - (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; - def : 
Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), - (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; +multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> { + let validForTailPredication = 1; + } + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>; +multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>; -let validForTailPredication = 1 in { - def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; - def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; -} +defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>; +defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>; -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), - (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; - def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), - (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; -} +defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>; +defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>; -class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]> +class MVE_VCADD<string suffix, bit size, string cstr=""> : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> { bits<4> Qd; bits<4> Qn; bit rot; @@ -2810,8 +3354,29 @@ class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]> let Inst{4} = 0b0; } -def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>; -def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">; +multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, bit size, string cstr=""> { + def "" : MVE_VCADD<VTI.Suffix, size, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (int_arm_mve_vcaddq (i32 1), + imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot))>; + + def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated (i32 1), + imm:$rot, (VTI.Vec MQPR:$inactive), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + + } +} + +defm MVE_VCADDf16 : MVE_VCADD_m<MVE_v8f16, 0b0>; +defm MVE_VCADDf32 : MVE_VCADD_m<MVE_v4f32, 0b1, "@earlyclobber $Qd">; class MVE_VABD_fp<string suffix, bit size> : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), @@ -2833,8 +3398,29 @@ class MVE_VABD_fp<string suffix, bit size> let validForTailPredication = 1; } -def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; -def MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>; +multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI, + Intrinsic unpred_int, Intrinsic 
pred_int> { + def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 0))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 0), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI> + : MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>; + +defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>; +defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>; class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, Operand imm_operand_type, list<dag> pattern=[]> @@ -3186,120 +3772,120 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; -multiclass unpred_vcmp_z<string suffix, int fc> { - def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), +multiclass unpred_vcmp_z<string suffix, PatLeaf fc> { + def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)), (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; - def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), + def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)), (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; - def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), + def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } -multiclass unpred_vcmp_r<string suffix, int fc> { - def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), +multiclass unpred_vcmp_r<string suffix, PatLeaf fc> { + def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)), (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; - def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), + def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)), (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; - def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), + def i32 : Pat<(v4i1 
(ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; - def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)), (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))), + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)), (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, 
VCCR:$p1))>; } -multiclass unpred_vcmpf_z<int fc> { - def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), +multiclass unpred_vcmpf_z<PatLeaf fc> { + def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; - def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), + def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), - (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))), + (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_r<int fc> { - def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), + def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; - def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), + def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; - def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), + def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; - def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), + def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), - (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), - (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))), + (v4i1 
(MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { - defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; - defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; - defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; - defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; - defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>; - defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; - defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; - defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; - - defm MVE_VCEQ : unpred_vcmp_r<"i", 0>; - defm MVE_VCNE : unpred_vcmp_r<"i", 1>; - defm MVE_VCGE : unpred_vcmp_r<"s", 10>; - defm MVE_VCLT : unpred_vcmp_r<"s", 11>; - defm MVE_VCGT : unpred_vcmp_r<"s", 12>; - defm MVE_VCLE : unpred_vcmp_r<"s", 13>; - defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; - defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; + defm MVE_VCEQZ : unpred_vcmp_z<"i", ARMCCeq>; + defm MVE_VCNEZ : unpred_vcmp_z<"i", ARMCCne>; + defm MVE_VCGEZ : unpred_vcmp_z<"s", ARMCCge>; + defm MVE_VCLTZ : unpred_vcmp_z<"s", ARMCClt>; + defm MVE_VCGTZ : unpred_vcmp_z<"s", ARMCCgt>; + defm MVE_VCLEZ : unpred_vcmp_z<"s", ARMCCle>; + defm MVE_VCGTUZ : unpred_vcmp_z<"u", ARMCChi>; + defm MVE_VCGEUZ : unpred_vcmp_z<"u", ARMCChs>; + + defm MVE_VCEQ : unpred_vcmp_r<"i", ARMCCeq>; + defm MVE_VCNE : unpred_vcmp_r<"i", ARMCCne>; + defm MVE_VCGE : unpred_vcmp_r<"s", ARMCCge>; + defm MVE_VCLT : unpred_vcmp_r<"s", ARMCClt>; + defm MVE_VCGT : unpred_vcmp_r<"s", ARMCCgt>; + defm MVE_VCLE : unpred_vcmp_r<"s", ARMCCle>; + defm MVE_VCGTU : unpred_vcmp_r<"u", ARMCChi>; + defm MVE_VCGEU : unpred_vcmp_r<"u", ARMCChs>; } let Predicates = [HasMVEFloat] in { - defm MVE_VFCEQZ : unpred_vcmpf_z<0>; - defm MVE_VFCNEZ : unpred_vcmpf_z<1>; - defm MVE_VFCGEZ : unpred_vcmpf_z<10>; - defm MVE_VFCLTZ : unpred_vcmpf_z<11>; - defm MVE_VFCGTZ : unpred_vcmpf_z<12>; - defm MVE_VFCLEZ : unpred_vcmpf_z<13>; + defm MVE_VFCEQZ : unpred_vcmpf_z<ARMCCeq>; + defm MVE_VFCNEZ : unpred_vcmpf_z<ARMCCne>; + defm MVE_VFCGEZ : unpred_vcmpf_z<ARMCCge>; + defm MVE_VFCLTZ : unpred_vcmpf_z<ARMCClt>; + defm MVE_VFCGTZ : unpred_vcmpf_z<ARMCCgt>; + defm MVE_VFCLEZ : unpred_vcmpf_z<ARMCCle>; - defm MVE_VFCEQ : unpred_vcmpf_r<0>; - defm MVE_VFCNE : unpred_vcmpf_r<1>; - defm MVE_VFCGE : unpred_vcmpf_r<10>; - defm MVE_VFCLT : unpred_vcmpf_r<11>; - defm MVE_VFCGT : unpred_vcmpf_r<12>; - defm MVE_VFCLE : unpred_vcmpf_r<13>; + defm MVE_VFCEQ : unpred_vcmpf_r<ARMCCeq>; + defm MVE_VFCNE : unpred_vcmpf_r<ARMCCne>; + defm MVE_VFCGE : unpred_vcmpf_r<ARMCCge>; + defm MVE_VFCLT : unpred_vcmpf_r<ARMCClt>; + defm MVE_VFCGT : unpred_vcmpf_r<ARMCCgt>; + defm MVE_VFCLE : unpred_vcmpf_r<ARMCCle>; } @@ -3403,10 +3989,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; -class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]> +class MVE_VCMUL<string iname, string suffix, bit size, string cstr=""> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> { bits<4> Qn; bits<2> rot; @@ -3422,8 +4008,30 @@ class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> let Predicates = [HasMVEFloat]; } -def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>; -def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">; +multiclass MVE_VCMUL_m<string iname, 
MVEVectorVTInfo VTI, + bit size, string cstr=""> { + def "" : MVE_VCMUL<iname, VTI.Suffix, size, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (int_arm_mve_vcmulq + imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot))>; + + def : Pat<(VTI.Vec (int_arm_mve_vcmulq_predicated + imm:$rot, (VTI.Vec MQPR:$inactive), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + + } +} + +defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16, 0b0>; +defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, 0b1, "@earlyclobber $Qd">; class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit T, string cstr, list<dag> pattern=[]> @@ -3442,29 +4050,80 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, let Inst{8} = 0b0; let Inst{7} = Qn{3}; let Inst{0} = 0b0; + let validForTailPredication = 1; } -multiclass MVE_VMULL_multi<string iname, string suffix, - bit bit_28, bits<2> bits_21_20, string cstr=""> { - def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>; - def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>; +multiclass MVE_VMULL_m<MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, + bit Top, string cstr=""> { + def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned, + VTI.Size, Top, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + defvar uflag = !if(!eq(VTI.SuffixLetter, "p"), (?), (? (i32 VTI.Unsigned))); + + // Unpredicated multiply + def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn)), + uflag, (? (i32 Top)))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated multiply + def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn)), + uflag, (? (i32 Top), (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + } } -// For integer multiplies, bits 21:20 encode size, and bit 28 signedness. -// For polynomial multiplies, bits 21:20 take the unused value 0b11, and -// bit 28 switches to encoding the size. - -defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>; -defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>; -defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">; -defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>; -defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>; -defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">; -defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; -defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; - -class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, - bit round, list<dag> pattern=[]> +// For polynomial multiplies, the size bits take the unused value 0b11, and +// the unsigned bit switches to encoding the size. 
+ +defm MVE_VMULLBs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0>; +defm MVE_VMULLTs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1>; +defm MVE_VMULLBs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0>; +defm MVE_VMULLTs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1>; +defm MVE_VMULLBs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0, + "@earlyclobber $Qd">; +defm MVE_VMULLTs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1, + "@earlyclobber $Qd">; + +defm MVE_VMULLBu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0>; +defm MVE_VMULLTu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1>; +defm MVE_VMULLBu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0>; +defm MVE_VMULLTu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1>; +defm MVE_VMULLBu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b0, + "@earlyclobber $Qd">; +defm MVE_VMULLTu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull, + int_arm_mve_mull_int_predicated, 0b1, + "@earlyclobber $Qd">; + +defm MVE_VMULLBp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly, + int_arm_mve_mull_poly_predicated, 0b0>; +defm MVE_VMULLTp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly, + int_arm_mve_mull_poly_predicated, 0b1>; +defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, + int_arm_mve_mull_poly_predicated, 0b0>; +defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, + int_arm_mve_mull_poly_predicated, 0b1>; + +class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round, + list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "", pattern> { @@ -3480,19 +4139,46 @@ class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, let Inst{0} = 0b1; } -def MVE_VMULHs8 : MVE_VxMULH<"vmulh", "s8", 0b0, 0b00, 0b0>; -def MVE_VMULHs16 : MVE_VxMULH<"vmulh", "s16", 0b0, 0b01, 0b0>; -def MVE_VMULHs32 : MVE_VxMULH<"vmulh", "s32", 0b0, 0b10, 0b0>; -def MVE_VMULHu8 : MVE_VxMULH<"vmulh", "u8", 0b1, 0b00, 0b0>; -def MVE_VMULHu16 : MVE_VxMULH<"vmulh", "u16", 0b1, 0b01, 0b0>; -def MVE_VMULHu32 : MVE_VxMULH<"vmulh", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op, + Intrinsic pred_int, bit round> { + def "" : MVE_VxMULH<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, round>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated multiply returning high bits + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated multiply returning high bits + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VMULT<string iname, MVEVectorVTInfo VTI, bit round> + : MVE_VxMULH_m<iname, VTI, !if(round, int_arm_mve_vrmulh, int_arm_mve_vmulh), + !if(round, int_arm_mve_rmulh_predicated, + int_arm_mve_mulh_predicated), + round>; + +defm 
MVE_VMULHs8 : MVE_VMULT<"vmulh", MVE_v16s8, 0b0>; +defm MVE_VMULHs16 : MVE_VMULT<"vmulh", MVE_v8s16, 0b0>; +defm MVE_VMULHs32 : MVE_VMULT<"vmulh", MVE_v4s32, 0b0>; +defm MVE_VMULHu8 : MVE_VMULT<"vmulh", MVE_v16u8, 0b0>; +defm MVE_VMULHu16 : MVE_VMULT<"vmulh", MVE_v8u16, 0b0>; +defm MVE_VMULHu32 : MVE_VMULT<"vmulh", MVE_v4u32, 0b0>; -def MVE_VRMULHs8 : MVE_VxMULH<"vrmulh", "s8", 0b0, 0b00, 0b1>; -def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>; -def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>; -def MVE_VRMULHu8 : MVE_VxMULH<"vrmulh", "u8", 0b1, 0b00, 0b1>; -def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>; -def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 0b1>; +defm MVE_VRMULHs8 : MVE_VMULT<"vrmulh", MVE_v16s8, 0b1>; +defm MVE_VRMULHs16 : MVE_VMULT<"vrmulh", MVE_v8s16, 0b1>; +defm MVE_VRMULHs32 : MVE_VMULT<"vrmulh", MVE_v4s32, 0b1>; +defm MVE_VRMULHu8 : MVE_VMULT<"vrmulh", MVE_v16u8, 0b1>; +defm MVE_VRMULHu16 : MVE_VMULT<"vrmulh", MVE_v8u16, 0b1>; +defm MVE_VRMULHu32 : MVE_VMULT<"vrmulh", MVE_v4u32, 0b1>; class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17, bits<2> size, bit T, list<dag> pattern=[]> @@ -3551,19 +4237,36 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, let Predicates = [HasMVEFloat]; } -multiclass MVE_VCVT_ff_halves<string suffix, bit op> { - def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>; - def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>; +multiclass MVE_VCVT_f2h_m<string iname, int half> { + def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (int_arm_mve_vcvt_narrow + (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))), + (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>; + def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated + (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half), + (v4i1 VCCR:$mask))), + (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), + ARMVCCThen, (v4i1 VCCR:$mask)))>; + } } -defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>; -defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>; +multiclass MVE_VCVT_h2f_m<string iname, int half> { + def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>; +} + +defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>; +defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>; +defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>; +defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>; class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, - string cstr="", list<dag> pattern=[]> + string cstr=""> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> { bits<4> Qn; bit rot; @@ -3577,13 +4280,35 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, let Inst{0} = 0b0; } -def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; -def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; -def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">; +multiclass MVE_VxCADD_m<string iname, MVEVectorVTInfo VTI, + bit halve, string cstr=""> { + def "" : MVE_VxCADD<iname, VTI.Suffix, VTI.Size, halve, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (int_arm_mve_vcaddq halve, + imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + 
imm:$rot))>; + + def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated halve, + imm:$rot, (VTI.Vec MQPR:$inactive), + (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm), + imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + + } +} + +defm MVE_VCADDi8 : MVE_VxCADD_m<"vcadd", MVE_v16i8, 0b1>; +defm MVE_VCADDi16 : MVE_VxCADD_m<"vcadd", MVE_v8i16, 0b1>; +defm MVE_VCADDi32 : MVE_VxCADD_m<"vcadd", MVE_v4i32, 0b1, "@earlyclobber $Qd">; -def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; -def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; -def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">; +defm MVE_VHCADDs8 : MVE_VxCADD_m<"vhcadd", MVE_v16s8, 0b0>; +defm MVE_VHCADDs16 : MVE_VxCADD_m<"vhcadd", MVE_v8s16, 0b0>; +defm MVE_VHCADDs32 : MVE_VxCADD_m<"vhcadd", MVE_v4s32, 0b0, "@earlyclobber $Qd">; class MVE_VADCSBC<string iname, bit I, bit subtract, dag carryin, list<dag> pattern=[]> @@ -3627,6 +4352,7 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T, let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{0} = 0b1; + let validForTailPredication = 1; } multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> { @@ -3742,6 +4468,7 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size, let Inst{12} = T; let Inst{8} = 0b1; let Inst{5} = 0b1; + let validForTailPredication = 1; } multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> { @@ -3804,13 +4531,30 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, let validForTailPredication = 1; } +multiclass MVE_VxSHL_qr_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> { + def "" : MVE_VxSHL_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>; + defvar Inst = !cast<Instruction>(NAME); + + def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar + (VTI.Vec MQPR:$in), (i32 rGPR:$sh), + (i32 q), (i32 r), (i32 VTI.Unsigned))), + (VTI.Vec (Inst (VTI.Vec MQPR:$in), (i32 rGPR:$sh)))>; + + def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar_predicated + (VTI.Vec MQPR:$in), (i32 rGPR:$sh), + (i32 q), (i32 r), (i32 VTI.Unsigned), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$in), (i32 rGPR:$sh), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; +} + multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> { - def s8 : MVE_VxSHL_qr<iname, "s8", 0b0, 0b00, bit_7, bit_17>; - def s16 : MVE_VxSHL_qr<iname, "s16", 0b0, 0b01, bit_7, bit_17>; - def s32 : MVE_VxSHL_qr<iname, "s32", 0b0, 0b10, bit_7, bit_17>; - def u8 : MVE_VxSHL_qr<iname, "u8", 0b1, 0b00, bit_7, bit_17>; - def u16 : MVE_VxSHL_qr<iname, "u16", 0b1, 0b01, bit_7, bit_17>; - def u32 : MVE_VxSHL_qr<iname, "u32", 0b1, 0b10, bit_7, bit_17>; + defm s8 : MVE_VxSHL_qr_p<iname, MVE_v16s8, bit_7, bit_17>; + defm s16 : MVE_VxSHL_qr_p<iname, MVE_v8s16, bit_7, bit_17>; + defm s32 : MVE_VxSHL_qr_p<iname, MVE_v4s32, bit_7, bit_17>; + defm u8 : MVE_VxSHL_qr_p<iname, MVE_v16u8, bit_7, bit_17>; + defm u16 : MVE_VxSHL_qr_p<iname, MVE_v8u16, bit_7, bit_17>; + defm u32 : MVE_VxSHL_qr_p<iname, MVE_v4u32, bit_7, bit_17>; } defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>; @@ -4054,7 +4798,7 @@ def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; let hasSideEffects = 1 in -class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> +class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", 
suffix, "$Rn", vpred_n, "", pattern> { bits<4> Rn; @@ -4072,20 +4816,22 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> let validForTailPredication = 1; } -def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; -def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; -def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; -def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> { + def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); -let Predicates = [HasMVEInt] in { - def : Pat<(int_arm_vctp8 rGPR:$Rn), - (v16i1 (MVE_VCTP8 rGPR:$Rn))>; - def : Pat<(int_arm_vctp16 rGPR:$Rn), - (v8i1 (MVE_VCTP16 rGPR:$Rn))>; - def : Pat<(int_arm_vctp32 rGPR:$Rn), - (v4i1 (MVE_VCTP32 rGPR:$Rn))>; + let Predicates = [HasMVEInt] in { + def : Pat<(intr rGPR:$Rn), (VTI.Pred (Inst rGPR:$Rn))>; + def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)), + (VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask))>; + } } +defm MVE_VCTP8 : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>; +defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>; +defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>; +defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>; + // end of mve_qDest_rSrc // start of coproc mov @@ -4258,6 +5004,29 @@ foreach wb = [MVE_vldst24_writeback< "vst" # n.nvecs # stage # "." # s.lanesize>; } +multiclass MVE_vst24_patterns<int lanesize, ValueType VT> { + foreach stage = [0,1] in + def : Pat<(int_arm_mve_vst2q i32:$addr, + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), + (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize) + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr)>; + + foreach stage = [0,1,2,3] in + def : Pat<(int_arm_mve_vst4q i32:$addr, + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), + (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize) + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr)>; +} +defm : MVE_vst24_patterns<8, v16i8>; +defm : MVE_vst24_patterns<16, v8i16>; +defm : MVE_vst24_patterns<32, v4i32>; +defm : MVE_vst24_patterns<16, v8f16>; +defm : MVE_vst24_patterns<32, v4f32>; + // end of MVE interleaving load/store // start of MVE predicable load/store @@ -4513,28 +5282,90 @@ class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz, string asm, string suffix, bit U, bits<2> size> : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>; +// Multiclasses wrapping that to add ISel patterns for intrinsics. 
+multiclass MVE_VLDR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> { + defm "": MVE_VLDRSTR_rq_w<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter, + VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>; + defvar Inst = !cast<Instruction>(NAME); + defvar InstU = !cast<Instruction>(NAME # "_u"); + + foreach VTI = VTIs in + foreach UnsignedFlag = !if(!eq(VTI.Size, memsz.encoding), + [0,1], [VTI.Unsigned]) in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag)), + (VTI.Vec (InstU GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)), + (VTI.Vec (Inst GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (InstU GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; + } +} +multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> { + def "": MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb", + VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>; + defvar Inst = !cast<Instruction>(NAME); + + foreach VTI = VTIs in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)), + (VTI.Vec (Inst GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))), + (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; + } +} +multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> { + defm "": MVE_VLDRSTR_rq_w<MVE_st, memsz, "vstr" # memsz.MnemonicLetter, + VTIs[0].BitsSuffix, 0, VTIs[0].Size>; + defvar Inst = !cast<Instruction>(NAME); + defvar InstU = !cast<Instruction>(NAME # "_u"); + + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0), + (InstU MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift), + (Inst MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)), + (InstU MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)), + (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; + } +} +multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> { + def "": MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb", + VTIs[0].BitsSuffix, 0, VTIs[0].Size>; + defvar Inst = !cast<Instruction>(NAME); + + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0), + (Inst MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)), + (Inst 
MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; + } +} + // Actually define all the loads and stores in this family. -def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u8", 1,0b00>; -def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u16", 1,0b01>; -def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s16", 0,0b01>; -def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u32", 1,0b10>; -def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s32", 0,0b10>; +defm MVE_VLDRBU8_rq : MVE_VLDR_rq_b<[MVE_v16u8,MVE_v16s8]>; +defm MVE_VLDRBU16_rq: MVE_VLDR_rq_b<[MVE_v8u16]>; +defm MVE_VLDRBS16_rq: MVE_VLDR_rq_b<[MVE_v8s16]>; +defm MVE_VLDRBU32_rq: MVE_VLDR_rq_b<[MVE_v4u32]>; +defm MVE_VLDRBS32_rq: MVE_VLDR_rq_b<[MVE_v4s32]>; -defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u16", 1,0b01>; -defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u32", 1,0b10>; -defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","s32", 0,0b10>; -defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memW, "vldrw","u32", 1,0b10>; -defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memD, "vldrd","u64", 1,0b11>; +defm MVE_VLDRHU16_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v8u16,MVE_v8s16,MVE_v8f16]>; +defm MVE_VLDRHU32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4u32]>; +defm MVE_VLDRHS32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4s32]>; +defm MVE_VLDRWU32_rq: MVE_VLDR_rq_w<MVE_memW, [MVE_v4u32,MVE_v4s32,MVE_v4f32]>; +defm MVE_VLDRDU64_rq: MVE_VLDR_rq_w<MVE_memD, [MVE_v2u64,MVE_v2s64]>; -def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","8", 0,0b00>; -def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","16", 0,0b01>; -def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","32", 0,0b10>; +defm MVE_VSTRB8_rq : MVE_VSTR_rq_b<[MVE_v16i8]>; +defm MVE_VSTRB16_rq : MVE_VSTR_rq_b<[MVE_v8i16]>; +defm MVE_VSTRB32_rq : MVE_VSTR_rq_b<[MVE_v4i32]>; -defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","16", 0,0b01>; -defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","32", 0,0b10>; -defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memW, "vstrw","32", 0,0b10>; -defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memD, "vstrd","64", 0,0b11>; +defm MVE_VSTRH16_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v8i16,MVE_v8f16]>; +defm MVE_VSTRH32_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v4i32]>; +defm MVE_VSTRW32_rq : MVE_VSTR_rq_w<MVE_memW, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VSTRD64_rq : MVE_VSTR_rq_w<MVE_memD, [MVE_v2i64]>; // Gather loads / scatter stores whose address operand is of the form // [Qm,#imm], i.e. a vector containing a full base address for each @@ -4573,11 +5404,58 @@ multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz, } } +// Multiclasses wrapping that one, adding selection patterns for the +// non-writeback loads and all the stores. (The writeback loads must +// deliver multiple output values, so they have to be selected by C++ +// code.) 
+multiclass MVE_VLDR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI, + list<MVEVectorVTInfo> DVTIs> { + defm "" : MVE_VLDRSTR_qi_m<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter, + "u" # memsz.TypeBits>; + defvar Inst = !cast<Instruction>(NAME); + + foreach DVTI = DVTIs in { + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset))), + (DVTI.Vec (Inst (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>; + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))), + (DVTI.Vec (Inst (AVTI.Vec MQPR:$addr), (i32 imm:$offset), + ARMVCCThen, VCCR:$pred))>; + } +} +multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI, + list<MVEVectorVTInfo> DVTIs> { + defm "" : MVE_VLDRSTR_qi_m<MVE_st, memsz, "vstr" # memsz.MnemonicLetter, + !cast<string>(memsz.TypeBits)>; + defvar Inst = !cast<Instruction>(NAME); + defvar InstPre = !cast<Instruction>(NAME # "_pre"); + + foreach DVTI = DVTIs in { + def : Pat<(int_arm_mve_vstr_scatter_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data)), + (Inst (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), + (i32 imm:$offset))>; + def : Pat<(int_arm_mve_vstr_scatter_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)), + (Inst (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), + (i32 imm:$offset), ARMVCCThen, VCCR:$pred)>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))), + (AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), + (i32 imm:$offset)))>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))), + (AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), + (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>; + } +} + // Actual instruction definitions. -defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memW, "vldrw", "u32">; -defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memD, "vldrd", "u64">; -defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memW, "vstrw", "32">; -defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">; +defm MVE_VLDRWU32_qi: MVE_VLDR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VLDRDU64_qi: MVE_VLDR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>; +defm MVE_VSTRW32_qi: MVE_VSTR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VSTRD64_qi: MVE_VSTR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>; // Define aliases for all the instructions where memory size and // vector lane size are the same. These are mnemonic aliases, so they @@ -4595,21 +5473,21 @@ defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">; foreach vpt_cond = ["", "t", "e"] in foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in foreach suffix = memsz.suffixes in { + // Define an alias with every suffix in the list, except for the one + // used by the real Instruction record (i.e. the one that all the + // rest are aliases *for*). + + if !ne(suffix, memsz.CanonLoadSuffix) then { + def : MnemonicAlias< + "vldr" # memsz.MnemonicLetter # vpt_cond # suffix, + "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>; + } - // These foreaches are conceptually ifs, implemented by iterating a - // dummy variable over a list with 0 or 1 elements depending on the - // condition. 
The idea is to iterate over _nearly_ all the suffixes - // in memsz.suffixes, but omit the one we want all the others to alias. - - foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []<int>) in - def : MnemonicAlias< - "vldr" # memsz.MnemonicLetter # vpt_cond # suffix, - "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>; - - foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []<int>) in - def : MnemonicAlias< - "vstr" # memsz.MnemonicLetter # vpt_cond # suffix, - "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>; + if !ne(suffix, memsz.CanonStoreSuffix) then { + def : MnemonicAlias< + "vstr" # memsz.MnemonicLetter # vpt_cond # suffix, + "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>; + } } // end of MVE predicable load/store @@ -4632,7 +5510,6 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte let Inst{4} = 0b0; let Defs = [VPR]; - let validForTailPredication = 1; } class MVE_VPTt1<string suffix, bits<2> size, dag iops> @@ -4644,7 +5521,6 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; - let validForTailPredication = 1; } class MVE_VPTt1i<string suffix, bits<2> size> @@ -4746,7 +5622,6 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern= let Defs = [VPR]; let Predicates = [HasMVEFloat]; - let validForTailPredication = 1; } class MVE_VPTft1<string suffix, bit size> @@ -4816,7 +5691,6 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; - let validForTailPredication = 1; } foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", @@ -4826,87 +5700,87 @@ def : MVEInstAlias<"vpsel${vp}." 
# suffix # "\t$Qd, $Qn, $Qm", let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), - (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), - (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), - (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), - (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), - (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), - (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne)))>; def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), - (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>; def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), - (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>; def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), - (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>; def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), - (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>; // Pred <-> Int def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), - (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), - (v8i16 
(MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), - (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), - (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; + (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), - (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), - (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, ARMCCne))>; } let Predicates = [HasMVEFloat] in { // Pred <-> Float // 112 is 1.0 in float def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), - (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>; // 2620 in 1.0 in half def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), - (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>; // 240 is -1.0 in float def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), - (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>; // 2748 is -1.0 in half def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), - (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>; } def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, @@ -4955,6 +5829,8 @@ class MVE_WLSTP<string asm, bits<2> size> let Inst{13} = 0b0; let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; + let isBranch = 1; + let isTerminator = 1; } def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; @@ -4983,6 +5859,8 @@ def 
MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout), let Inst{13} = 0b0; let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; + let isBranch = 1; + let isTerminator = 1; } def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { @@ -4998,61 +5876,7 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { // Patterns //===----------------------------------------------------------------------===// -class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst, - PatFrag StoreKind, int shift> - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; -class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst, - PatFrag StoreKind, int shift> - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>; - -multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind, - int shift> { - def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; - def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; -} - -class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst, - PatFrag LoadKind, int shift> - : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), - (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; -class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst, - PatFrag LoadKind, int shift> - : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), - (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>; - -multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind, - int shift> { - def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; - def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; -} - -class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode, - PatFrag StoreKind, int shift> - : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr), - (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>; - -multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind, - int shift> { - def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>; - def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>; -} +// PatFrags for loads and stores. Often trying to keep semi-consistent names. 
def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (pre_store node:$val, node:$ptr, node:$offset), [{ @@ -5072,77 +5896,249 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), }]>; -def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ +def aligned_maskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast<MaskedLoadSDNode>(N); return Ld->getMemoryVT().getScalarType() == MVT::i8; }]>; -def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_sextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_zextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; -def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_extmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast<MaskedLoadSDNode>(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; -def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ +def aligned_maskedloadvi16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast<MaskedLoadSDNode>(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; }]>; -def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_sextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_zextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; -def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_extmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast<MaskedLoadSDNode>(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; -def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, 
node:$pred, node:$passthru), [{ +def aligned_maskedloadvi32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast<MaskedLoadSDNode>(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; }]>; -def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ +def aligned_maskedstvi8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (maskedstore8 node:$val, node:$ptr, node:$pred), [{ - return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +def aligned_maskedstvi16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def aligned_maskedstvi32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; }]>; -def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ + +def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), [{ + ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode(); + return AM == ISD::PRE_INC || AM == ISD::PRE_DEC; +}]>; +def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), [{ + ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode(); + return AM == ISD::POST_INC || AM == ISD::POST_DEC; +}]>; +def aligned_pre_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_post_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_pre_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ auto *St = cast<MaskedStoreSDNode>(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; }]>; +def aligned_post_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def aligned_pre_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, 
node:$offset, node:$mask), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; +def aligned_post_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; + + +// PatFrags for "Aligned" extending / truncating -def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (maskedstore16 node:$val, node:$ptr, node:$pred), [{ +def aligned_extloadvi8 : PatFrag<(ops node:$ptr), (extloadvi8 node:$ptr)>; +def aligned_sextloadvi8 : PatFrag<(ops node:$ptr), (sextloadvi8 node:$ptr)>; +def aligned_zextloadvi8 : PatFrag<(ops node:$ptr), (zextloadvi8 node:$ptr)>; + +def aligned_truncstvi8 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi8 node:$val, node:$ptr)>; +def aligned_post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi8 node:$val, node:$base, node:$offset)>; +def aligned_pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi8 node:$val, node:$base, node:$offset)>; + +let MinAlignment = 2 in { + def aligned_extloadvi16 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; + def aligned_sextloadvi16 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; + def aligned_zextloadvi16 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; + + def aligned_truncstvi16 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi16 node:$val, node:$ptr)>; + def aligned_post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi16 node:$val, node:$base, node:$offset)>; + def aligned_pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi16 node:$val, node:$base, node:$offset)>; +} + +def truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$pred), + (masked_st node:$val, node:$base, undef, node:$pred), [{ return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); }]>; -def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ +def aligned_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$pred), + (truncmaskedst node:$val, node:$base, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$pred), + (truncmaskedst node:$val, node:$base, node:$pred), [{ auto *St = cast<MaskedStoreSDNode>(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); - return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; }]>; +def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (masked_st node:$val, node:$base, node:$offset, node:$pred), [{ + ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode(); + return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::PRE_INC || AM == ISD::PRE_DEC); +}]>; +def aligned_pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == 
MVT::i8; +}]>; +def aligned_pre_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (masked_st node:$val, node:$base, node:$offset, node:$postd), [{ + ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode(); + return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::POST_INC || AM == ISD::POST_DEC); +}]>; +def aligned_post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_post_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; + +// Load/store patterns + +class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; + +class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred)>; + +multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind, + int shift> { + def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; +} + +class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; + +class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>; + +multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind, + int shift> { + def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; +} + +class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode, + 
PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>; + +class MVE_vector_offset_maskedstore_typed<ValueType Ty, Instruction Opcode, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr, VCCR:$pred), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, ARMVCCThen, VCCR:$pred)>; + +multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind, + int shift> { + def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>; +} + let Predicates = [HasMVEInt, IsLE] in { // Stores @@ -5220,116 +6216,73 @@ let Predicates = [HasMVEInt, IsBE] in { let Predicates = [HasMVEInt] in { // Aligned masked store, shared between LE and BE - def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>; - def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>; - def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>; - def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>; - def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>; - // Truncating stores - def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, aligned_maskedstvi8, 0>; + def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, aligned_maskedstvi16, 1>; + def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, aligned_maskedstvi16, 1>; + def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, aligned_maskedstvi32, 2>; + def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, aligned_maskedstvi32, 2>; + + // Pre/Post inc masked stores + def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_pre, aligned_pre_maskedstorevi8, 0>; + def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_post, aligned_post_maskedstorevi8, 0>; + def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>; + def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>; + def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>; + def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>; + def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_pre, aligned_pre_maskedstorevi32, 2>; + def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>; + def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_pre, 
aligned_pre_maskedstorevi32, 2>; + def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>; + // Aligned masked loads - def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>; - def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>; - def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>; - def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>; - def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>; - // Extending masked loads. - def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, aligned_maskedloadvi8, 0>; + def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>; + def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>; + def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>; + def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>; } // Widening/Narrowing Loads/Stores -let MinAlignment = 2 in { - def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr), - (truncstorevi16 node:$val, node:$ptr)>; - def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), - (post_truncstvi16 node:$val, node:$base, node:$offset)>; - def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), - (pre_truncstvi16 node:$val, node:$base, node:$offset)>; -} - -let Predicates = [HasMVEInt] in { - def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr), - (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>; - def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr), - (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>; - def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr), - (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>; - - def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, 
t2am_imm7_offset<0>:$addr), - (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), - (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; - - def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), - (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; -} - - -let MinAlignment = 2 in { - def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; - def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; - def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; -} - -multiclass MVEExtLoad<string DestLanes, string DestElemBits, - string SrcElemBits, string SrcElemType, - string Align, Operand am> { - def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)), - (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) - am:$addr)>; - def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)), - (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) - am:$addr)>; - def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)), - (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits) - am:$addr)>; +multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string StoreInst, + string Amble, ValueType VT, int Shift> { + // Trunc stores + def : Pat<(!cast<PatFrag>("aligned_truncst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr), + (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr)>; + def : Pat<(!cast<PatFrag>("aligned_post_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr), + (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>; + def : Pat<(!cast<PatFrag>("aligned_pre_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr), + (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>; + + // Masked trunc stores + def : Pat<(!cast<PatFrag>("aligned_truncmaskedst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr, VCCR:$pred), + (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred)>; + def : Pat<(!cast<PatFrag>("aligned_post_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred), + (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>; + def : Pat<(!cast<PatFrag>("aligned_pre_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred), + (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>; + + // Ext loads + def : Pat<(VT (!cast<PatFrag>("aligned_extload"#Amble) taddrmode_imm7<Shift>:$addr)), + (VT 
(LoadUInst taddrmode_imm7<Shift>:$addr))>; + def : Pat<(VT (!cast<PatFrag>("aligned_sextload"#Amble) taddrmode_imm7<Shift>:$addr)), + (VT (LoadSInst taddrmode_imm7<Shift>:$addr))>; + def : Pat<(VT (!cast<PatFrag>("aligned_zextload"#Amble) taddrmode_imm7<Shift>:$addr)), + (VT (LoadUInst taddrmode_imm7<Shift>:$addr))>; + + // Masked ext loads + def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; + def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; + def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; } let Predicates = [HasMVEInt] in { - defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>; - defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>; - defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>; + defm : MVEExtLoadStore<MVE_VLDRBS16, MVE_VLDRBU16, "MVE_VSTRB16", "vi8", v8i16, 0>; + defm : MVEExtLoadStore<MVE_VLDRBS32, MVE_VLDRBU32, "MVE_VSTRB32", "vi8", v4i32, 0>; + defm : MVEExtLoadStore<MVE_VLDRHS32, MVE_VLDRHU32, "MVE_VSTRH32", "vi16", v4i32, 1>; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td index 60ca92e58041..6244d8d9e27e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -3314,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // source operand element sizes of 8, 16 and 32 bits: multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, string opc, string Dt, - string asm, int fc> { + string asm, PatFrag fc> { // 64-bit vector types. 
def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "8"), asm, "", - [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>; + [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), fc)))]>; def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "16"), asm, "", - [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>; + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), fc)))]>; def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "32"), asm, "", - [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>; + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), fc)))]>; def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, "f32", asm, "", - [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> { + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), fc)))]> { let Inst{10} = 1; // overwrite F = 1 } def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, "f16", asm, "", - [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>, + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), fc)))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3346,25 +3346,25 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "8"), asm, "", - [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>; + [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), fc)))]>; def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "16"), asm, "", - [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>; + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), fc)))]>; def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "32"), asm, "", - [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>; + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), fc)))]>; def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, "f32", asm, "", - [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> { + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), fc)))]> { let Inst{10} = 1; // overwrite F = 1 } def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, "f16", asm, "", - [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>, + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), fc)))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3373,11 +3373,11 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 3-register comparisons. 
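Note on the comparison changes here and in the 3-register classes below: the raw integer condition operands are replaced by named fragments (ARMCCeq, ARMCCge, ARMCChs, ...), and the mapping can be read off the literals being removed in these hunks. A small illustrative enum recording that mapping (not the backend's own definition):

#include <cstdio>

// Condition-code encodings as implied by the replaced literals:
// 0 -> eq, 1 -> ne, 2 -> hs, 8 -> hi, 10 -> ge, 11 -> lt, 12 -> gt, 13 -> le.
enum CondCode {
  EQ = 0,   // vceq              (was literal 0)
  NE = 1,   // vcmp ... ne       (was literal 1)
  HS = 2,   // vcge, unsigned    (was literal 2)
  HI = 8,   // vcgt, unsigned    (was literal 8)
  GE = 10,  // vcge, signed      (was literal 10)
  LT = 11,  // vclt, signed      (was literal 11)
  GT = 12,  // vcgt, signed      (was literal 12)
  LE = 13   // vcle, signed      (was literal 13)
};

int main() {
  std::printf("signed vcge condition encodes as %d\n", GE);
  return 0;
}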
class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", - [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> { + [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), fc)))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; @@ -3385,11 +3385,11 @@ class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", - [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> { + [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), fc)))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; @@ -3399,7 +3399,7 @@ multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - int fc, bit Commutable = 0> { + PatFrag fc, bit Commutable = 0> { // 64-bit vector types. def v8i8 : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16, OpcodeStr, !strconcat(Dt, "8"), @@ -4287,10 +4287,10 @@ defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, // VQADD : Vector Saturating Add defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, - "vqadd", "s", int_arm_neon_vqadds, 1>; + "vqadd", "s", saddsat, 1>; defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, - "vqadd", "u", int_arm_neon_vqaddu, 1>; + "vqadd", "u", uaddsat, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) @@ -4527,22 +4527,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))))), @@ -4551,7 +4551,7 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAHsl : 
N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), @@ -4559,7 +4559,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4567,7 +4567,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4579,7 +4579,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4597,22 +4597,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))))), @@ -4621,14 +4621,14 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4636,7 +4636,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4648,7 +4648,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4667,20 +4667,20 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 
DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -4759,20 +4759,20 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -5012,6 +5012,27 @@ defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>; defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>; defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla", null_frag>; +let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in { + def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))), + (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 0))>; + def : Pat<(v4f16 (int_arm_neon_vcadd_rot270 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))), + (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 1))>; + def : Pat<(v8f16 (int_arm_neon_vcadd_rot90 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))), + (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 0))>; + def : Pat<(v8f16 (int_arm_neon_vcadd_rot270 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))), + (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 1))>; +} +let Predicates = [HasNEON,HasV8_3a] in { + def : Pat<(v2f32 (int_arm_neon_vcadd_rot90 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))), + (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 0))>; + def : Pat<(v2f32 (int_arm_neon_vcadd_rot270 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))), + (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 1))>; + def : Pat<(v4f32 (int_arm_neon_vcadd_rot90 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))), + (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 0))>; + def : Pat<(v4f32 (int_arm_neon_vcadd_rot270 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))), + (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 1))>; +} + // Vector Subtract Operations. 
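Note on the saturating-arithmetic changes above and in the vqsub hunk below: selection now goes through the generic saddsat/uaddsat/ssubsat/usubsat nodes instead of target intrinsics, which is also what the rewritten vqrdmlah/vqdmlal patterns match against. A scalar sketch of the semantics (illustrative C++, not the DAG implementation); results clamp to the element range instead of wrapping:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar reference semantics for signed saturating add and unsigned
// saturating subtract; the vector forms apply this per lane.
int16_t saddsat16(int16_t A, int16_t B) {
  int32_t Wide = int32_t(A) + int32_t(B);                      // widen first
  Wide = std::clamp(Wide, int32_t(INT16_MIN), int32_t(INT16_MAX));
  return static_cast<int16_t>(Wide);                           // then clamp back
}

uint8_t usubsat8(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);                  // floor at zero
}

int main() {
  std::printf("%d\n", saddsat16(30000, 10000));   // 32767, not a wrapped value
  std::printf("%u\n", unsigned(usubsat8(5, 9)));  // 0, not 252
  return 0;
}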
// VSUB : Vector Subtract (integer and floating-point) @@ -5045,10 +5066,10 @@ defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, // VQSUB : Vector Saturing Subtract defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "s", int_arm_neon_vqsubs, 0>; + "vqsub", "s", ssubsat, 0>; defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "u", int_arm_neon_vqsubu, 0>; + "vqsub", "u", usubsat, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) @@ -5068,66 +5089,66 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), // VCEQ : Vector Compare Equal defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vceq", "i", 0, 1>; + IIC_VSUBi4Q, "vceq", "i", ARMCCeq, 1>; def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, - 0, 1>; + ARMCCeq, 1>; def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, - 0, 1>; + ARMCCeq, 1>; def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, - 0, 1>, + ARMCCeq, 1>, Requires<[HasNEON, HasFullFP16]>; def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, - 0, 1>, + ARMCCeq, 1>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$Vd, $Vm, #0", 0>; + "$Vd, $Vm, #0", ARMCCeq>; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "s", 10, 0>; + IIC_VSUBi4Q, "vcge", "s", ARMCCge, 0>; defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "u", 2, 0>; + IIC_VSUBi4Q, "vcge", "u", ARMCChs, 0>; def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, - 10, 0>; + ARMCCge, 0>; def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, - 10, 0>; + ARMCCge, 0>; def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, - 10, 0>, + ARMCCge, 0>, Requires<[HasNEON, HasFullFP16]>; def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, - 10, 0>, + ARMCCge, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", - "$Vd, $Vm, #0", 10>; + "$Vd, $Vm, #0", ARMCCge>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", - "$Vd, $Vm, #0", 13>; + "$Vd, $Vm, #0", ARMCCle>; } // VCGT : Vector Compare Greater Than defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "s", 12, 0>; + IIC_VSUBi4Q, "vcgt", "s", ARMCCgt, 0>; defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "u", 8, 0>; + IIC_VSUBi4Q, "vcgt", "u", ARMCChi, 0>; def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, - 12, 0>; + ARMCCgt, 0>; def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, - 12, 0>; + ARMCCgt, 0>; def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, - 12, 0>, + ARMCCgt, 0>, Requires<[HasNEON, HasFullFP16]>; def VCGThq : 
N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, - 12, 0>, + ARMCCgt, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", - "$Vd, $Vm, #0", 12>; + "$Vd, $Vm, #0", ARMCCgt>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", - "$Vd, $Vm, #0", 11>; + "$Vd, $Vm, #0", ARMCClt>; } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) @@ -6797,9 +6818,12 @@ def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; let Predicates = [HasNEON] in { -def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; -def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>; -def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; + def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), + (VREV64q32 QPR:$Vm)>; + def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), + (VREV64q16 QPR:$Vm)>; + def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), + (VREV64d16 DPR:$Vm)>; } // VREV32 : Vector Reverse elements within 32-bit words @@ -6821,6 +6845,13 @@ def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>; def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>; +let Predicates = [HasNEON] in { + def : Pat<(v8f16 (ARMvrev32 (v8f16 QPR:$Vm))), + (VREV32q16 QPR:$Vm)>; + def : Pat<(v4f16 (ARMvrev32 (v4f16 DPR:$Vm))), + (VREV32d16 DPR:$Vm)>; +} + // VREV16 : Vector Reverse elements within 16-bit halfwords class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td index 25a45b39fa0c..4193e8147f47 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand, // t2addrmode_imm8s4 := reg +/- (imm8 << 2) def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";} -class T2AddrMode_Imm8s4 : MemOperand { +class T2AddrMode_Imm8s4 : MemOperand, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8<2>", []> { let EncoderMethod = "getT2AddrModeImm8s4OpValue"; let DecoderMethod = "DecodeT2AddrModeImm8s4"; let ParserMatchClass = MemImm8s4OffsetAsmOperand; @@ -917,10 +918,26 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode, // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. 
let isReMaterializable = 1 in { + def spImm : T2sTwoRegImm< + (outs GPRsp:$Rd), (ins GPRsp:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + []>, + Sched<[WriteALU, ReadALU]> { + let Rn = 13; + let Rd = 13; + + let Inst{31-27} = 0b11110; + let Inst{25-24} = 0b01; + let Inst{23-21} = op23_21; + let Inst{15} = 0; + + let DecoderMethod = "DecodeT2AddSubSPImm"; + } + def ri : T2sTwoRegImm< - (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi, + (outs rGPR:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi, opc, ".w\t$Rd, $Rn, $imm", - [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>, + [(set rGPR:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>, Sched<[WriteALU, ReadALU]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; @@ -931,9 +948,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode, } // 12-bit imm def ri12 : T2I< - (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi, + (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi, !strconcat(opc, "w"), "\t$Rd, $Rn, $imm", - [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>, + [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; @@ -949,6 +966,26 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode, let Inst{11-8} = Rd; let Inst{7-0} = imm{7-0}; } + def spImm12 : T2I< + (outs GPRsp:$Rd), (ins GPRsp:$Rn, imm0_4095:$imm), IIC_iALUi, + !strconcat(opc, "w"), "\t$Rd, $Rn, $imm", + []>, + Sched<[WriteALU, ReadALU]> { + bits<4> Rd = 13; + bits<4> Rn = 13; + bits<12> imm; + let Inst{31-27} = 0b11110; + let Inst{26} = imm{11}; + let Inst{25-24} = 0b10; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-12} = imm{10-8}; + let Inst{11-8} = Rd; + let Inst{7-0} = imm{7-0}; + let DecoderMethod = "DecodeT2AddSubSPImm"; + } // register def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm", @@ -1412,7 +1449,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { // Load doubleword def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_imm8s4:$addr), - IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", + [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>, Sched<[WriteLd]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1593,7 +1631,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), - IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", + [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>, Sched<[WriteST]>; // Indexed stores @@ -2264,19 +2303,29 @@ def : t2InstSubst<"sbc${s}${p} $rd, $rn, $imm", (t2ADCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>; def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm", - (t2SUBri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; -def : t2InstSubst<"addw${p} $rd, $rn, $imm", - (t2SUBri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>; + (t2SUBri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; +def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm", + (t2ADDri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; +def : t2InstSubst<"subw${p} $Rd, 
$Rn, $imm", + (t2ADDri12 rGPR:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm", + (t2ADDri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; +def : t2InstSubst<"sub${p} $rd, $rn, $imm", + (t2ADDri12 rGPR:$rd, GPR:$rn, imm0_4095_neg:$imm, pred:$p)>; + +// SP to SP alike +def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm", + (t2SUBspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm", - (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; -def : t2InstSubst<"subw${p} $rd, $rn, $imm", - (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>; + (t2ADDspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; def : t2InstSubst<"subw${p} $Rd, $Rn, $imm", - (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; + (t2ADDspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>; def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm", - (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; + (t2ADDspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>; def : t2InstSubst<"sub${p} $rd, $rn, $imm", - (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>; + (t2ADDspImm12 GPRsp:$rd, GPRsp:$rn, imm0_4095_neg:$imm, pred:$p)>; + + // RSB defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>; @@ -2292,12 +2341,12 @@ defm t2RSBS : T2I_rbin_s_is <ARMsubc>; // The AddedComplexity preferences the first variant over the others since // it can be shrunk to a 16-bit wide encoding, while the others cannot. let AddedComplexity = 1 in -def : T2Pat<(add GPR:$src, imm1_255_neg:$imm), - (t2SUBri GPR:$src, imm1_255_neg:$imm)>; -def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), - (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; -def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), - (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +def : T2Pat<(add rGPR:$src, imm1_255_neg:$imm), + (t2SUBri rGPR:$src, imm1_255_neg:$imm)>; +def : T2Pat<(add rGPR:$src, t2_so_imm_neg:$imm), + (t2SUBri rGPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(add rGPR:$src, imm0_4095_neg:$imm), + (t2SUBri12 rGPR:$src, imm0_4095_neg:$imm)>; def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm), (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; @@ -2796,10 +2845,10 @@ def : T2Pat<(t2_so_imm_not:$src), // Thumb2SizeReduction's chances later on we select a t2ADD for an or where // possible. 
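Note on the or-to-add patterns that follow: they are sound only when the OR's operands share no set bits, which is how the AddLikeOrOp fragment is understood to be defined elsewhere in the backend; with disjoint operands no carries occur, so OR and ADD agree. A minimal check of that identity (illustrative C++, assuming that reading of AddLikeOrOp):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Rn  = 0xFFFF0000u;       // high half only
  const uint32_t Imm = 0x00000FF0u;       // low bits only, disjoint from Rn
  assert((Rn & Imm) == 0);                // the disjointness precondition
  assert((Rn | Imm) == Rn + Imm);         // or == add when no bits overlap
  return 0;
}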
def : T2Pat<(or AddLikeOrOp:$Rn, t2_so_imm:$imm), - (t2ADDri $Rn, t2_so_imm:$imm)>; + (t2ADDri rGPR:$Rn, t2_so_imm:$imm)>; def : T2Pat<(or AddLikeOrOp:$Rn, imm0_4095:$Rm), - (t2ADDri12 $Rn, imm0_4095:$Rm)>; + (t2ADDri12 rGPR:$Rn, imm0_4095:$Rm)>; def : T2Pat<(or AddLikeOrOp:$Rn, non_imm32:$Rm), (t2ADDrr $Rn, $Rm)>; @@ -4551,10 +4600,18 @@ class T2TT<bits<2> at, string asm, list<dag> pattern> let Unpredictable{5-0} = 0b111111; } -def t2TT : T2TT<0b00, "tt", []>, Requires<[IsThumb,Has8MSecExt]>; -def t2TTT : T2TT<0b01, "ttt", []>, Requires<[IsThumb,Has8MSecExt]>; -def t2TTA : T2TT<0b10, "tta", []>, Requires<[IsThumb,Has8MSecExt]>; -def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>; +def t2TT : T2TT<0b00, "tt", + [(set rGPR:$Rt, (int_arm_cmse_tt GPRnopc:$Rn))]>, + Requires<[IsThumb, Has8MSecExt]>; +def t2TTT : T2TT<0b01, "ttt", + [(set rGPR:$Rt, (int_arm_cmse_ttt GPRnopc:$Rn))]>, + Requires<[IsThumb, Has8MSecExt]>; +def t2TTA : T2TT<0b10, "tta", + [(set rGPR:$Rt, (int_arm_cmse_tta GPRnopc:$Rn))]>, + Requires<[IsThumb, Has8MSecExt]>; +def t2TTAT : T2TT<0b11, "ttat", + [(set rGPR:$Rt, (int_arm_cmse_ttat GPRnopc:$Rn))]>, + Requires<[IsThumb, Has8MSecExt]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns @@ -4655,10 +4712,10 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm", // Aliases for ADD without the ".w" optional width specifier. def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", - (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, + (t2ADDri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"add${p} $Rd, $Rn, $imm", - (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; + (t2ADDri12 rGPR:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm", (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm", @@ -4666,9 +4723,11 @@ def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm", pred:$p, cc_out:$s)>; // ... and with the destination and source register combined. def : t2InstAlias<"add${s}${p} $Rdn, $imm", - (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; + (t2ADDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"add${p} $Rdn, $imm", - (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>; + (t2ADDri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"addw${p} $Rdn, $imm", + (t2ADDri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>; def : t2InstAlias<"add${s}${p} $Rdn, $Rm", (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm", @@ -4677,33 +4736,33 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm", // add w/ negative immediates is just a sub. 
def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm", - (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; def : t2InstSubst<"add${p} $Rd, $Rn, $imm", - (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; def : t2InstSubst<"add${s}${p} $Rdn, $imm", - (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; def : t2InstSubst<"add${p} $Rdn, $imm", - (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095_neg:$imm, pred:$p)>; def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm", - (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; def : t2InstSubst<"addw${p} $Rd, $Rn, $imm", - (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rd, rGPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; def : t2InstSubst<"add${s}${p}.w $Rdn, $imm", - (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; def : t2InstSubst<"addw${p} $Rdn, $imm", - (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095_neg:$imm, pred:$p)>; // Aliases for SUB without the ".w" optional width specifier. def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", - (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; + (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${p} $Rd, $Rn, $imm", - (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm", (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm", @@ -4711,9 +4770,11 @@ def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm", pred:$p, cc_out:$s)>; // ... and with the destination and source register combined. def : t2InstAlias<"sub${s}${p} $Rdn, $imm", - (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; + (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${p} $Rdn, $imm", - (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>; + (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"subw${p} $Rdn, $imm", + (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>; def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm", (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${s}${p} $Rdn, $Rm", @@ -4722,6 +4783,65 @@ def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm", (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>; +// SP to SP alike aliases +// Aliases for ADD without the ".w" optional width specifier. +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2ADDspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rd, $Rn, $imm", + (t2ADDspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095:$imm, pred:$p)>; +// ... and with the destination and source register combined. 
+def : t2InstAlias<"add${s}${p} $Rdn, $imm", + (t2ADDspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; + +def : t2InstAlias<"add${s}${p}.w $Rdn, $imm", + (t2ADDspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; + +def : t2InstAlias<"add${p} $Rdn, $imm", + (t2ADDspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>; + +def : t2InstAlias<"addw${p} $Rdn, $imm", + (t2ADDspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>; + +// add w/ negative immediates is just a sub. +def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstSubst<"add${p} $Rd, $Rn, $imm", + (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstSubst<"add${s}${p} $Rdn, $imm", + (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstSubst<"add${p} $Rdn, $imm", + (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + +def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm", + (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstSubst<"addw${p} $Rd, $Rn, $imm", + (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstSubst<"add${s}${p}.w $Rdn, $imm", + (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstSubst<"addw${p} $Rdn, $imm", + (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + + +// Aliases for SUB without the ".w" optional width specifier. +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", + (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${p} $Rd, $Rn, $imm", + (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095:$imm, pred:$p)>; +// ... and with the destination and source register combined. +def : t2InstAlias<"sub${s}${p} $Rdn, $imm", + (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${s}${p}.w $Rdn, $imm", + (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${p} $Rdn, $imm", + (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"subw${p} $Rdn, $imm", + (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>; + // Alias for compares without the ".w" optional width specifier. 
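The SP-to-SP substitutions above preserve the long-standing assembler convenience that an ADD with a negative immediate is accepted and encoded as the corresponding SUB, now routed through t2SUBspImm/t2SUBspImm12 instead of the general-purpose forms. A hedged, syntax-only illustration using inline assembly (not from the patch; which exact encoding the assembler picks can vary with the immediate and instruction width):

    // Thumb-2 only: "add sp, sp, #-16" is accepted and emitted as
    // "sub sp, sp, #16" via the t2InstSubst mappings above. The pair
    // below is balanced so SP is unchanged on return.
    void sp_substitution_demo() {
      asm volatile("add sp, sp, #-16\n\t"   // assembles as sub sp, sp, #16
                   "add sp, sp, #16");      // assembles as-is
    }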
def : t2InstAlias<"cmn${p} $Rn, $Rm", (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; @@ -4978,10 +5098,16 @@ def : t2InstSubst<"orr${s}${p} $Rdn, $imm", pred:$p, cc_out:$s)>; // Likewise, "add Rd, t2_so_imm_neg" -> sub def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm", - (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, + (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +def : t2InstSubst<"add${s}${p} $Rd, $imm", + (t2SUBri rGPR:$Rd, rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; def : t2InstSubst<"add${s}${p} $Rd, $imm", - (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm, + (t2SUBspImm GPRsp:$Rd, GPRsp:$Rd, t2_so_imm_neg:$imm, pred:$p, cc_out:$s)>; // Same for CMP <--> CMN via t2_so_imm_neg def : t2InstSubst<"cmp${p} $Rd, $imm", @@ -5178,8 +5304,6 @@ class t2LOL<dag oops, dag iops, string asm, string ops> let Inst{31-23} = 0b111100000; let Inst{15-14} = 0b11; let Inst{0} = 0b1; - let isBranch = 1; - let isTerminator = 1; let DecoderMethod = "DecodeLOLoop"; let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; } @@ -5196,13 +5320,13 @@ def t2WLS : t2LOL<(outs GPRlr:$LR), let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; let usesCustomInserter = 1; + let isBranch = 1; + let isTerminator = 1; } def t2DLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn), "dls", "$LR, $Rn"> { bits<4> Rn; - let isBranch = 0; - let isTerminator = 0; let Inst{22-20} = 0b100; let Inst{19-16} = Rn{3-0}; let Inst{13-1} = 0b1000000000000; @@ -5218,6 +5342,8 @@ def t2LEUpdate : t2LOL<(outs GPRlr:$LRout), let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; let usesCustomInserter = 1; + let isBranch = 1; + let isTerminator = 1; } def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> { @@ -5226,6 +5352,8 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> { let Inst{13-12} = 0b00; let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; + let isBranch = 1; + let isTerminator = 1; } def t2DoLoopStart : diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td index fdd961bfbb2f..a41a483d1a4c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -277,7 +277,7 @@ def : MnemonicAlias<"vstm", "vstmia">; // let mayLoad = 1 in def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, - IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>, + NoItinerary, "vlldm${p}\t$Rn", "", []>, Requires<[HasV8MMainline, Has8MSecExt]> { let Inst{24-23} = 0b00; let Inst{22} = 0; @@ -290,7 +290,7 @@ def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, let mayStore = 1 in def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, - IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>, + NoItinerary, "vlstm${p}\t$Rn", "", []>, Requires<[HasV8MMainline, Has8MSecExt]> { let Inst{24-23} = 0b00; let Inst{22} = 0; @@ -2143,6 +2143,9 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin)), + (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; // (fma x, (fneg y), z) -> (vfms z, x, y) def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, @@ -2150,6 +2153,9 @@ def : 
Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin)), + (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; def VFNMAD : ADbI<0b11101, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -2196,6 +2202,9 @@ def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 HPR:$Sdin))), + (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, @@ -2203,6 +2212,9 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, (fneg HPR:$Sdin))), + (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; def VFNMSD : ADbI<0b11101, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -2248,6 +2260,9 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (fneg HPR:$Sdin))), + (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, @@ -2255,6 +2270,9 @@ def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(fneg (f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin))), + (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; // (fneg (fma x, (fneg y), z) -> (vfnms z, x, y) def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, @@ -2262,6 +2280,9 @@ def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(fneg (f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin))), + (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; //===----------------------------------------------------------------------===// // FP Conditional moves. 
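The new f16 patterns above extend the existing f32/f64 mappings of negated-operand fma forms onto the VFMS/VFNMA/VFNMS family to half precision. A rough source-level illustration (not from the patch; it assumes a toolchain with _Float16 support, a +fullfp16 target, and FP contraction enabled so the multiply and add are fused):

    // With contraction, acc - a*b becomes (fma (fneg a), b, acc),
    // which the new pattern selects as VFMS.F16.
    _Float16 vfms_like(_Float16 acc, _Float16 a, _Float16 b) {
      return acc - a * b;
    }

    // a*b - acc becomes (fma a, b, (fneg acc)), i.e. VFNMS.F16.
    _Float16 vfnms_like(_Float16 acc, _Float16 a, _Float16 b) {
      return a * b - acc;
    }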
@@ -2279,6 +2300,12 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p), [(set (f32 SPR:$Sd), (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>, RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>; + +def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p), + IIC_fpUNA16, + [(set (f16 HPR:$Sd), + (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>, + RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>; } // hasSideEffects //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp index 8e5e474c0f59..67816bc2103f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "arm-isel" @@ -137,8 +138,10 @@ private: unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, unsigned Size) const; - void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old) const; - void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old) const; + void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old, + int OpIdx = -1) const; + void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old, + int OpIdx = -1) const; #define GET_GLOBALISEL_PREDICATES_DECL #include "ARMGenGlobalISel.inc" @@ -810,9 +813,10 @@ bool ARMInstructionSelector::selectShift(unsigned ShiftOpc, } void ARMInstructionSelector::renderVFPF32Imm( - MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const { + MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst, + int OpIdx) const { assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT && - "Expected G_FCONSTANT"); + OpIdx == -1 && "Expected G_FCONSTANT"); APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF(); int FPImmEncoding = ARM_AM::getFP32Imm(FPImmValue); @@ -822,9 +826,9 @@ void ARMInstructionSelector::renderVFPF32Imm( } void ARMInstructionSelector::renderVFPF64Imm( - MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const { + MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst, int OpIdx) const { assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT && - "Expected G_FCONSTANT"); + OpIdx == -1 && "Expected G_FCONSTANT"); APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF(); int FPImmEncoding = ARM_AM::getFP64Imm(FPImmValue); @@ -1061,7 +1065,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) { case G_SHL: { return selectShift(ARM_AM::ShiftOpc::lsl, MIB); } - case G_GEP: + case G_PTR_ADD: I.setDesc(TII.get(Opcodes.ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 81414e6d76fe..e2dff51ea61c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -162,7 +162,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .legalFor({s32, p0}) .minScalar(0, s32); - getActionDefinitionsBuilder(G_GEP) + getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, 
s32}}) .minScalar(1, s32); @@ -264,7 +264,7 @@ void ARMLegalizerInfo::setFCmpLibcallsAEABI() { {RTLIB::OLE_F32, CmpInst::BAD_ICMP_PREDICATE}}; FCmp32Libcalls[CmpInst::FCMP_OLT] = { {RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}}; - FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F32, CmpInst::ICMP_EQ}}; FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_EQ}}; FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_EQ}}; FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_EQ}}; @@ -290,7 +290,7 @@ void ARMLegalizerInfo::setFCmpLibcallsAEABI() { {RTLIB::OLE_F64, CmpInst::BAD_ICMP_PREDICATE}}; FCmp64Libcalls[CmpInst::FCMP_OLT] = { {RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}}; - FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F64, CmpInst::ICMP_EQ}}; FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_EQ}}; FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_EQ}}; FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_EQ}}; @@ -315,7 +315,7 @@ void ARMLegalizerInfo::setFCmpLibcallsGNU() { FCmp32Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT}}; FCmp32Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F32, CmpInst::ICMP_SLE}}; FCmp32Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F32, CmpInst::ICMP_SLT}}; - FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F32, CmpInst::ICMP_EQ}}; FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_SGE}}; FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_SGT}}; FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SLE}}; @@ -333,7 +333,7 @@ void ARMLegalizerInfo::setFCmpLibcallsGNU() { FCmp64Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT}}; FCmp64Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F64, CmpInst::ICMP_SLE}}; FCmp64Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F64, CmpInst::ICMP_SLT}}; - FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F64, CmpInst::ICMP_EQ}}; FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_SGE}}; FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_SGT}}; FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SLE}}; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4a193fed04a3..12dddd29ca84 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -696,18 +696,23 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti( return nullptr; } - int BaseOpc = - isThumb2 ? ARM::t2ADDri : - (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi : - (isThumb1 && Offset < 8) ? ARM::tADDi3 : - isThumb1 ? ARM::tADDi8 : ARM::ADDri; + int BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2ADDspImm + : ARM::t2ADDri) + : (isThumb1 && Base == ARM::SP) + ? ARM::tADDrSPi + : (isThumb1 && Offset < 8) + ? ARM::tADDi3 + : isThumb1 ? ARM::tADDi8 : ARM::ADDri; if (Offset < 0) { - Offset = - Offset; - BaseOpc = - isThumb2 ? ARM::t2SUBri : - (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 : - isThumb1 ? 
ARM::tSUBi8 : ARM::SUBri; + // FIXME: There are no Thumb1 load/store instructions with negative + // offsets. So the Base != ARM::SP might be unnecessary. + Offset = -Offset; + BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2SUBspImm + : ARM::t2SUBri) + : (isThumb1 && Offset < 8 && Base != ARM::SP) + ? ARM::tSUBi3 + : isThumb1 ? ARM::tSUBi8 : ARM::SUBri; } if (!TL->isLegalAddImmediate(Offset)) @@ -1186,8 +1191,10 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; case ARM::t2SUBri: + case ARM::t2SUBspImm: case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; case ARM::t2ADDri: + case ARM::t2ADDspImm: case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index e1c5a9c3e223..6717d4706aef 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -15,6 +15,26 @@ /// - t2LoopDec - placed within in the loop body. /// - t2LoopEnd - the loop latch terminator. /// +/// In addition to this, we also look for the presence of the VCTP instruction, +/// which determines whether we can generated the tail-predicated low-overhead +/// loop form. +/// +/// Assumptions and Dependencies: +/// Low-overhead loops are constructed and executed using a setup instruction: +/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. +/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range +/// but fixed polarity: WLS can only branch forwards and LE can only branch +/// backwards. These restrictions mean that this pass is dependent upon block +/// layout and block sizes, which is why it's the last pass to run. The same is +/// true for ConstantIslands, but this pass does not increase the size of the +/// basic blocks, nor does it change the CFG. Instructions are mainly removed +/// during the transform and pseudo instructions are replaced by real ones. In +/// some cases, when we have to revert to a 'normal' loop, we have to introduce +/// multiple instructions for a single pseudo (see RevertWhile and +/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd +/// are defined to be as large as this maximum sequence of replacement +/// instructions. 
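For orientation (this sketch is not part of the patch), the shape of loop this pass targets at source level is a simple counted loop: the hardware-loop intrinsics inserted earlier in the pipeline become the t2DoLoopStart / t2LoopDec / t2LoopEnd pseudos referred to above, and this pass then turns them into DLS/LE, or into DLSTP/LETP when the body is predicated by a VCTP.

    // A counted loop with a trip count known on entry: a typical
    // candidate for conversion to a DLS/LE low-overhead loop (and, if
    // vectorised for MVE with tail predication, DLSTP/LETP).
    void scale_by_3(int *dst, const int *src, int n) {
      for (int i = 0; i < n; ++i)
        dst[i] = src[i] * 3;
    }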
+/// //===----------------------------------------------------------------------===// #include "ARM.h" @@ -22,9 +42,16 @@ #include "ARMBaseRegisterInfo.h" #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ReachingDefAnalysis.h" +#include "llvm/MC/MCInstrDesc.h" using namespace llvm; @@ -33,10 +60,154 @@ using namespace llvm; namespace { + struct PredicatedMI { + MachineInstr *MI = nullptr; + SetVector<MachineInstr*> Predicates; + + public: + PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) : + MI(I) { + Predicates.insert(Preds.begin(), Preds.end()); + } + }; + + // Represent a VPT block, a list of instructions that begins with a VPST and + // has a maximum of four proceeding instructions. All instructions within the + // block are predicated upon the vpr and we allow instructions to define the + // vpr within in the block too. + class VPTBlock { + std::unique_ptr<PredicatedMI> VPST; + PredicatedMI *Divergent = nullptr; + SmallVector<PredicatedMI, 4> Insts; + + public: + VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { + VPST = std::make_unique<PredicatedMI>(MI, Preds); + } + + void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { + LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); + if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) { + Divergent = &Insts.back(); + LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); + } + Insts.emplace_back(MI, Preds); + assert(Insts.size() <= 4 && "Too many instructions in VPT block!"); + } + + // Have we found an instruction within the block which defines the vpr? If + // so, not all the instructions in the block will have the same predicate. + bool HasNonUniformPredicate() const { + return Divergent != nullptr; + } + + // Is the given instruction part of the predicate set controlling the entry + // to the block. + bool IsPredicatedOn(MachineInstr *MI) const { + return VPST->Predicates.count(MI); + } + + // Is the given instruction the only predicate which controls the entry to + // the block. + bool IsOnlyPredicatedOn(MachineInstr *MI) const { + return IsPredicatedOn(MI) && VPST->Predicates.size() == 1; + } + + unsigned size() const { return Insts.size(); } + SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; } + MachineInstr *getVPST() const { return VPST->MI; } + PredicatedMI *getDivergent() const { return Divergent; } + }; + + struct LowOverheadLoop { + + MachineLoop *ML = nullptr; + MachineFunction *MF = nullptr; + MachineInstr *InsertPt = nullptr; + MachineInstr *Start = nullptr; + MachineInstr *Dec = nullptr; + MachineInstr *End = nullptr; + MachineInstr *VCTP = nullptr; + VPTBlock *CurrentBlock = nullptr; + SetVector<MachineInstr*> CurrentPredicate; + SmallVector<VPTBlock, 4> VPTBlocks; + bool Revert = false; + bool CannotTailPredicate = false; + + LowOverheadLoop(MachineLoop *ML) : ML(ML) { + MF = ML->getHeader()->getParent(); + } + + // If this is an MVE instruction, check that we know how to use tail + // predication with it. Record VPT blocks and return whether the + // instruction is valid for tail predication. 
+ bool ValidateMVEInst(MachineInstr *MI); + + void AnalyseMVEInst(MachineInstr *MI) { + CannotTailPredicate = !ValidateMVEInst(MI); + } + + bool IsTailPredicationLegal() const { + // For now, let's keep things really simple and only support a single + // block for tail predication. + return !Revert && FoundAllComponents() && VCTP && + !CannotTailPredicate && ML->getNumBlocks() == 1; + } + + bool ValidateTailPredicate(MachineInstr *StartInsertPt, + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI); + + // Is it safe to define LR with DLS/WLS? + // LR can be defined if it is the operand to start, because it's the same + // value, or if it's going to be equivalent to the operand to Start. + MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA); + + // Check the branch targets are within range and we satisfy our + // restrictions. + void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI); + + bool FoundAllComponents() const { + return Start && Dec && End; + } + + SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; } + + // Return the loop iteration count, or the number of elements if we're tail + // predicating. + MachineOperand &getCount() { + return IsTailPredicationLegal() ? + VCTP->getOperand(1) : Start->getOperand(0); + } + + unsigned getStartOpcode() const { + bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + if (!IsTailPredicationLegal()) + return IsDo ? ARM::t2DLS : ARM::t2WLS; + + return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo); + } + + void dump() const { + if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; + if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; + if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; + if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; + if (!FoundAllComponents()) + dbgs() << "ARM Loops: Not a low-overhead loop.\n"; + else if (!(Start && Dec && End)) + dbgs() << "ARM Loops: Failed to find all loop components.\n"; + } + }; + class ARMLowOverheadLoops : public MachineFunctionPass { MachineFunction *MF = nullptr; + MachineLoopInfo *MLI = nullptr; + ReachingDefAnalysis *RDA = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; public: @@ -47,6 +218,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineLoopInfo>(); + AU.addRequired<ReachingDefAnalysis>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -54,7 +226,8 @@ namespace { MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); + MachineFunctionProperties::Property::NoVRegs).set( + MachineFunctionProperties::Property::TracksLiveness); } StringRef getPassName() const override { @@ -64,8 +237,6 @@ namespace { private: bool ProcessLoop(MachineLoop *ML); - MachineInstr * IsSafeToDefineLR(MachineInstr *MI); - bool RevertNonLoops(); void RevertWhile(MachineInstr *MI) const; @@ -74,9 +245,13 @@ namespace { void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *InsertPt, MachineInstr *Dec, - MachineInstr *End, bool Revert); + void RemoveLoopUpdate(LowOverheadLoop &LoLoop); + + void ConvertVPTBlocks(LowOverheadLoop &LoLoop); + + MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); + + void Expand(LowOverheadLoop &LoLoop); }; } @@ -86,128 +261,321 @@ 
char ARMLowOverheadLoops::ID = 0; INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { - const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); - if (!ST.hasLOB()) - return false; +MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { + // We can define LR because LR already contains the same value. + if (Start->getOperand(0).getReg() == ARM::LR) + return Start; - MF = &mf; - LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); + unsigned CountReg = Start->getOperand(0).getReg(); + auto IsMoveLR = [&CountReg](MachineInstr *MI) { + return MI->getOpcode() == ARM::tMOVr && + MI->getOperand(0).getReg() == ARM::LR && + MI->getOperand(1).getReg() == CountReg && + MI->getOperand(2).getImm() == ARMCC::AL; + }; - auto &MLI = getAnalysis<MachineLoopInfo>(); - MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); - MRI = &MF->getRegInfo(); - TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); - BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); - BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF->front()); + MachineBasicBlock *MBB = Start->getParent(); - bool Changed = false; - for (auto ML : MLI) { - if (!ML->getParentLoop()) - Changed |= ProcessLoop(ML); - } - Changed |= RevertNonLoops(); - return Changed; + // Find an insertion point: + // - Is there a (mov lr, Count) before Start? If so, and nothing else writes + // to Count before Start, we can insert at that mov. + if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR)) + if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + return LRDef; + + // - Is there a (mov lr, Count) after Start? If so, and nothing else writes + // to Count after Start, we can insert at that mov. + if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR)) + if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + return LRDef; + + // We've found no suitable LR def and Start doesn't use LR directly. Can we + // just define LR anyway? + if (!RDA->isRegUsedAfter(Start, ARM::LR)) + return Start; + + return nullptr; } -static bool IsLoopStart(MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2WhileLoopStart; +// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must +// not define a register that is used by any instructions, after and including, +// 'To'. These instructions also must not redefine any of Froms operands. +template<typename Iterator> +static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) { + SmallSet<int, 2> Defs; + // First check that From would compute the same value if moved. + for (auto &MO : From->operands()) { + if (!MO.isReg() || MO.isUndef() || !MO.getReg()) + continue; + if (MO.isDef()) + Defs.insert(MO.getReg()); + else if (!RDA->hasSameReachingDef(From, To, MO.getReg())) + return false; + } + + // Now walk checking that the rest of the instructions will compute the same + // value. 
+ for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) { + for (auto &MO : I->operands()) + if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg())) + return false; + } + return true; } -template<typename T> -static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { - for(auto &MI : make_range(T(Begin), End)) { - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) +bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, + ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) { + assert(VCTP && "VCTP instruction expected but is not set"); + // All predication within the loop should be based on vctp. If the block + // isn't predicated on entry, check whether the vctp is within the block + // and that all other instructions are then predicated on it. + for (auto &Block : VPTBlocks) { + if (Block.IsPredicatedOn(VCTP)) + continue; + if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: " + << *Block.getDivergent()->MI); + return false; + } + SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); + for (auto &PredMI : Insts) { + if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI)) continue; - return &MI; + LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI + << " - which is predicated on:\n"; + for (auto *MI : PredMI.Predicates) + dbgs() << " - " << *MI; + ); + return false; } } - return nullptr; -} -static MachineInstr* SearchForUse(MachineInstr *Begin, - MachineBasicBlock::iterator End, - unsigned Reg) { - for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - return &MI; + // For tail predication, we need to provide the number of elements, instead + // of the iteration count, to the loop start instruction. The number of + // elements is provided to the vctp instruction, so we need to check that + // we can use this register at InsertPt. + Register NumElements = VCTP->getOperand(1).getReg(); + + // If the register is defined within loop, then we can't perform TP. + // TODO: Check whether this is just a mov of a register that would be + // available. + if (RDA->getReachingDef(VCTP, NumElements) >= 0) { + LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); + return false; + } + + // The element count register maybe defined after InsertPt, in which case we + // need to try to move either InsertPt or the def so that the [w|d]lstp can + // use the value. 
+ MachineBasicBlock *InsertBB = InsertPt->getParent(); + if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) { + if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { + if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) { + ElemDef->removeFromParent(); + InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef); + LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " + << *ElemDef); + } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) { + InsertPt->removeFromParent(); + InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt); + LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop " + << "start instruction.\n"); + return false; + } } } - return nullptr; + + // Especially in the case of while loops, InsertBB may not be the + // preheader, so we need to check that the register isn't redefined + // before entering the loop. + auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB, + Register NumElements) { + // NumElements is redefined in this block. + if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0) + return true; + + // Don't continue searching up through multiple predecessors. + if (MBB->pred_size() > 1) + return true; + + return false; + }; + + // First, find the block that looks like the preheader. + MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); + if (!MBB) { + LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n"); + return false; + } + + // Then search backwards for a def, until we get to InsertBB. + while (MBB != InsertBB) { + if (CannotProvideElements(MBB, NumElements)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); + return false; + } + MBB = *MBB->pred_begin(); + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); + return true; } -// Is it safe to define LR with DLS/WLS? -// LR can defined if it is the operand to start, because it's the same value, -// or if it's going to be equivalent to the operand to Start. -MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { +void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI) { + if (Revert) + return; - auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { - return MI->getOpcode() == ARM::tMOVr && - MI->getOperand(0).getReg() == ARM::LR && - MI->getOperand(1).getReg() == Reg && - MI->getOperand(2).getImm() == ARMCC::AL; - }; + if (!End->getOperand(1).isMBB()) + report_fatal_error("Expected LoopEnd to target basic block"); - MachineBasicBlock *MBB = Start->getParent(); - unsigned CountReg = Start->getOperand(0).getReg(); - // Walk forward and backward in the block to find the closest instructions - // that define LR. Then also filter them out if they're not a mov lr. - MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); - if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) - PredLRDef = nullptr; - - MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); - if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) - SuccLRDef = nullptr; - - // We've either found one, two or none mov lr instructions... Now figure out - // if they are performing the equilvant mov that the Start instruction will. - // Do this by scanning forward and backward to see if there's a def of the - // register holding the count value. If we find a suitable def, return it as - // the insert point. 
Later, if InsertPt != Start, then we can remove the - // redundant instruction. - if (SuccLRDef) { - MachineBasicBlock::iterator End(SuccLRDef); - if (!SearchForDef(Start, End, CountReg)) { - return SuccLRDef; - } else - SuccLRDef = nullptr; + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (End->getOperand(1).getMBB() != ML->getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); + Revert = true; + return; } - if (PredLRDef) { - MachineBasicBlock::reverse_iterator End(PredLRDef); - if (!SearchForDef(Start, End, CountReg)) { - return PredLRDef; - } else - PredLRDef = nullptr; + + // The WLS and LE instructions have 12-bits for the label offset. WLS + // requires a positive offset, while LE uses negative. + if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || + !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); + Revert = true; + return; } - // We can define LR because LR already contains the same value. - if (Start->getOperand(0).getReg() == ARM::LR) - return Start; + if (Start->getOpcode() == ARM::t2WhileLoopStart && + (BBUtils->getOffsetOf(Start) > + BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || + !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + Revert = true; + return; + } - // We've found no suitable LR def and Start doesn't use LR directly. Can we - // just define LR anyway? - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - LivePhysRegs LiveRegs(*TRI); - LiveRegs.addLiveOuts(*MBB); + InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA); + if (!InsertPt) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + return; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - // Not if we've haven't found a suitable mov and LR is live out. - if (LiveRegs.contains(ARM::LR)) - return nullptr; + if (!IsTailPredicationLegal()) { + LLVM_DEBUG(if (!VCTP) + dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; + dbgs() << "ARM Loops: Tail-predication is not valid.\n"); + return; + } - // If LR is not live out, we can insert the instruction if nothing else - // uses LR after it. - if (!SearchForUse(Start, MBB->end(), ARM::LR)) - return Start; + assert(ML->getBlocks().size() == 1 && + "Shouldn't be processing a loop with more than one block"); + CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI); + LLVM_DEBUG(if (CannotTailPredicate) + dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"); +} - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" - << " LR\n"); - return nullptr; +bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { + if (CannotTailPredicate) + return false; + + // Only support a single vctp. + if (isVCTP(MI) && VCTP) + return false; + + // Start a new vpt block when we discover a vpt. 
+ if (MI->getOpcode() == ARM::MVE_VPST) { + VPTBlocks.emplace_back(MI, CurrentPredicate); + CurrentBlock = &VPTBlocks.back(); + return true; + } else if (isVCTP(MI)) + VCTP = MI; + else if (MI->getOpcode() == ARM::MVE_VPSEL || + MI->getOpcode() == ARM::MVE_VPNOT) + return false; + + // TODO: Allow VPSEL and VPNOT, we currently cannot because: + // 1) It will use the VPR as a predicate operand, but doesn't have to be + // instead a VPT block, which means we can assert while building up + // the VPT block because we don't find another VPST to being a new + // one. + // 2) VPSEL still requires a VPR operand even after tail predicating, + // which means we can't remove it unless there is another + // instruction, such as vcmp, that can provide the VPR def. + + bool IsUse = false; + bool IsDef = false; + const MCInstrDesc &MCID = MI->getDesc(); + for (int i = MI->getNumOperands() - 1; i >= 0; --i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || MO.getReg() != ARM::VPR) + continue; + + if (MO.isDef()) { + CurrentPredicate.insert(MI); + IsDef = true; + } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) { + CurrentBlock->addInst(MI, CurrentPredicate); + IsUse = true; + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); + return false; + } + } + + // If we find a vpr def that is not already predicated on the vctp, we've + // got disjoint predicates that may not be equivalent when we do the + // conversion. + if (IsDef && !IsUse && VCTP && !isVCTP(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI); + return false; + } + + uint64_t Flags = MCID.TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + return true; + + // If we find an instruction that has been marked as not valid for tail + // predication, only allow the instruction if it's contained within a valid + // VPT block. 
+ if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) { + LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); + return false; + } + + return true; +} + +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); + if (!ST.hasLOB()) + return false; + + MF = &mf; + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); + + MLI = &getAnalysis<MachineLoopInfo>(); + RDA = &getAnalysis<ReachingDefAnalysis>(); + MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); + MRI = &MF->getRegInfo(); + TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); + BBUtils->computeAllBlockSizes(); + BBUtils->adjustBBOffsetsAfter(&MF->front()); + + bool Changed = false; + for (auto ML : *MLI) { + if (!ML->getParentLoop()) + Changed |= ProcessLoop(ML); + } + Changed |= RevertNonLoops(); + return Changed; } bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { @@ -218,14 +586,21 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { for (auto I = ML->begin(), E = ML->end(); I != E; ++I) Changed |= ProcessLoop(*I); - LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); + LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n"; + if (auto *Preheader = ML->getLoopPreheader()) + dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML)) + dbgs() << " - " << Preheader->getName() << "\n"; + for (auto *MBB : ML->getBlocks()) + dbgs() << " - " << MBB->getName() << "\n"; + ); // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { for (auto &MI : *MBB) { - if (IsLoopStart(MI)) + if (isLoopStart(MI)) return &MI; } if (MBB->pred_size() == 1) @@ -233,53 +608,43 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { return nullptr; }; - MachineInstr *Start = nullptr; - MachineInstr *Dec = nullptr; - MachineInstr *End = nullptr; - bool Revert = false; - - // Search the preheader for the start intrinsic, or look through the - // predecessors of the header to find exactly one set.iterations intrinsic. + LowOverheadLoop LoLoop(ML); + // Search the preheader for the start intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this. - if (auto *Preheader = ML->getLoopPreheader()) { - Start = SearchForStart(Preheader); - } else { - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" - << " - Performing manual predecessor search.\n"); - MachineBasicBlock *Pred = nullptr; - for (auto *MBB : ML->getHeader()->predecessors()) { - if (!ML->contains(MBB)) { - if (Pred) { - LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); - Start = nullptr; - break; - } - Pred = MBB; - Start = SearchForStart(MBB); - } - } - } + if (auto *Preheader = ML->getLoopPreheader()) + LoLoop.Start = SearchForStart(Preheader); + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + LoLoop.Start = SearchForStart(Preheader); + else + return false; // Find the low-overhead loop components and decide whether or not to fall - // back to a normal loop. + // back to a normal loop. 
Also look for a vctp instructions and decide + // whether we can convert that predicate using tail predication. for (auto *MBB : reverse(ML->getBlocks())) { for (auto &MI : *MBB) { if (MI.getOpcode() == ARM::t2LoopDec) - Dec = &MI; + LoLoop.Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) - End = &MI; - else if (IsLoopStart(MI)) - Start = &MI; + LoLoop.End = &MI; + else if (isLoopStart(MI)) + LoLoop.Start = &MI; else if (MI.getDesc().isCall()) { // TODO: Though the call will require LE to execute again, does this // mean we should revert? Always executing LE hopefully should be // faster than performing a sub,cmp,br or even subs,br. - Revert = true; + LoLoop.Revert = true; LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); + } else { + // Record VPR defs and build up their corresponding vpt blocks. + // Check we know how to tail predicate any mve instructions. + LoLoop.AnalyseMVEInst(&MI); } - if (!Dec || End) + // We need to ensure that LR is not used or defined inbetween LoopDec and + // LoopEnd. + if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) continue; // If we find that LR has been written or read between LoopDec and @@ -294,61 +659,21 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && MO.getReg() == ARM::LR) { LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); - Revert = true; + LoLoop.Revert = true; break; } } } - - if (Dec && End && Revert) - break; } - LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; - if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; - if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); - - if (!Start && !Dec && !End) { - LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); - return Changed; - } else if (!(Start && Dec && End)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); + LLVM_DEBUG(LoLoop.dump()); + if (!LoLoop.FoundAllComponents()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n"); return false; } - if (!End->getOperand(1).isMBB()) - report_fatal_error("Expected LoopEnd to target basic block"); - - // TODO Maybe there's cases where the target doesn't have to be the header, - // but for now be safe and revert. - if (End->getOperand(1).getMBB() != ML->getHeader()) { - LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); - Revert = true; - } - - // The WLS and LE instructions have 12-bits for the label offset. WLS - // requires a positive offset, while LE uses negative. - if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || - !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { - LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); - Revert = true; - } - if (Start->getOpcode() == ARM::t2WhileLoopStart && - (BBUtils->getOffsetOf(Start) > - BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || - !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { - LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); - Revert = true; - } - - MachineInstr *InsertPt = Revert ? 
nullptr : IsSafeToDefineLR(Start); - if (!InsertPt) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); - Revert = true; - } else - LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - - Expand(ML, Start, InsertPt, Dec, End, Revert); + LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); + Expand(LoLoop); return true; } @@ -365,7 +690,7 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { MIB.addImm(0); MIB.addImm(ARMCC::AL); MIB.addReg(ARM::NoRegister); - + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; @@ -378,19 +703,15 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { } bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, - bool AllowFlags) const { + bool SetFlags) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); - // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. - bool SetFlags = false; - if (AllowFlags) { - if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { - if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && - Def->getOpcode() == ARM::t2LoopEnd) - SetFlags = true; - } - } + // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. + if (SetFlags && + (RDA->isRegUsedAfter(MI, ARM::CPSR) || + !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR))) + SetFlags = false; MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); @@ -438,44 +759,223 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { MI->eraseFromParent(); } -void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *InsertPt, - MachineInstr *Dec, MachineInstr *End, - bool Revert) { +MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { + MachineInstr *InsertPt = LoLoop.InsertPt; + MachineInstr *Start = LoLoop.Start; + MachineBasicBlock *MBB = InsertPt->getParent(); + bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + unsigned Opc = LoLoop.getStartOpcode(); + MachineOperand &Count = LoLoop.getCount(); - auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, - MachineInstr *InsertPt) { - MachineBasicBlock *MBB = InsertPt->getParent(); - unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? - ARM::t2DLS : ARM::t2WLS; - MachineInstrBuilder MIB = - BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); + MachineInstrBuilder MIB = + BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); - MIB.addDef(ARM::LR); - MIB.add(Start->getOperand(0)); - if (Opc == ARM::t2WLS) - MIB.add(Start->getOperand(1)); - - if (InsertPt != Start) - InsertPt->eraseFromParent(); - Start->eraseFromParent(); - LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); - return &*MIB; + MIB.addDef(ARM::LR); + MIB.add(Count); + if (!IsDo) + MIB.add(Start->getOperand(1)); + + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. 
+ if (LoLoop.IsTailPredicationLegal()) { + SmallVector<MachineInstr*, 4> Killed; + SmallVector<MachineInstr*, 4> Dead; + if (auto *Def = RDA->getReachingMIDef(Start, + Start->getOperand(0).getReg())) { + Killed.push_back(Def); + + while (!Killed.empty()) { + MachineInstr *Def = Killed.back(); + Killed.pop_back(); + Dead.push_back(Def); + for (auto &MO : Def->operands()) { + if (!MO.isReg() || !MO.isKill()) + continue; + + MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); + if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) + Killed.push_back(Kill); + } + } + for (auto *MI : Dead) + MI->eraseFromParent(); + } + } + + // If we're inserting at a mov lr, then remove it as it's redundant. + if (InsertPt != Start) + InsertPt->eraseFromParent(); + Start->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); + return &*MIB; +} + +// Goal is to optimise and clean-up these loops: +// +// vector.body: +// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg +// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 +// .. +// $lr = MVE_DLSTP_32 renamable $r3 +// +// The SUB is the old update of the loop iteration count expression, which +// is no longer needed. This sub is removed when the element count, which is in +// r3 in this example, is defined by an instruction in the loop, and it has +// no uses. +// +void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { + Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); + MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); + + LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); + + if (LoLoop.ML->getNumBlocks() != 1) { + LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n"); + return; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: "; + LoLoop.VCTP->getOperand(1).dump()); + + // Find the definition we are interested in removing, if there is one. + MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); + if (!Def) { + LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n"); + return; + } + + // Bail if we define CPSR and it is not dead + if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); + return; + } + + // Bail if elemcount is used in exit blocks, i.e. if it is live-in. + if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); + return; + } + + // Bail if there are uses after this Def in the block. + SmallVector<MachineInstr*, 4> Uses; + RDA->getReachingLocalUses(Def, ElemCount, Uses); + if (Uses.size()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); + return; + } + + Uses.clear(); + RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); + + // Remove Def if there are no uses, or if the only use is the VCTP + // instruction. 
+ if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; + Def->dump()); + Def->eraseFromParent(); + return; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n"; + for (auto U : Uses) U->dump()); +} + +void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { + auto RemovePredicate = [](MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); + if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) { + assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && + "Expected Then predicate!"); + MI->getOperand(PIdx).setImm(ARMVCC::None); + MI->getOperand(PIdx+1).setReg(0); + } else + llvm_unreachable("trying to unpredicate a non-predicated instruction"); }; + // There are a few scenarios which we have to fix up: + // 1) A VPT block with is only predicated by the vctp and has no internal vpr + // defs. + // 2) A VPT block which is only predicated by the vctp but has an internal + // vpr def. + // 3) A VPT block which is predicated upon the vctp as well as another vpr + // def. + // 4) A VPT block which is not predicated upon a vctp, but contains it and + // all instructions within the block are predicated upon in. + + for (auto &Block : LoLoop.getVPTBlocks()) { + SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); + if (Block.HasNonUniformPredicate()) { + PredicatedMI *Divergent = Block.getDivergent(); + if (isVCTP(Divergent->MI)) { + // The vctp will be removed, so the size of the vpt block needs to be + // modified. + uint64_t Size = getARMVPTBlockMask(Block.size() - 1); + Block.getVPST()->getOperand(0).setImm(Size); + LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n"); + } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { + // The VPT block has a non-uniform predicate but it's entry is guarded + // only by a vctp, which means we: + // - Need to remove the original vpst. + // - Then need to unpredicate any following instructions, until + // we come across the divergent vpr def. + // - Insert a new vpst to predicate the instruction(s) that following + // the divergent vpr def. + // TODO: We could be producing more VPT blocks than necessary and could + // fold the newly created one into a proceeding one. + for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()), + E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) + RemovePredicate(&*I); + + unsigned Size = 0; + auto E = MachineBasicBlock::reverse_iterator(Divergent->MI); + auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI); + MachineInstr *InsertAt = nullptr; + while (I != E) { + InsertAt = &*I; + ++Size; + ++I; + } + MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, + InsertAt->getDebugLoc(), + TII->get(ARM::MVE_VPST)); + MIB.addImm(getARMVPTBlockMask(Size)); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); + LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); + Block.getVPST()->eraseFromParent(); + } + } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { + // A vpt block which is only predicated upon vctp and has no internal vpr + // defs: + // - Remove vpst. + // - Unpredicate the remaining instructions. 
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); + Block.getVPST()->eraseFromParent(); + for (auto &PredMI : Insts) + RemovePredicate(PredMI.MI); + } + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); + LoLoop.VCTP->eraseFromParent(); +} + +void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { + // Combine the LoopDec and LoopEnd instructions into LE(TP). - auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, - MachineInstr *End) { + auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { + MachineInstr *End = LoLoop.End; MachineBasicBlock *MBB = End->getParent(); + unsigned Opc = LoLoop.IsTailPredicationLegal() ? + ARM::MVE_LETP : ARM::t2LEUpdate; MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), - TII->get(ARM::t2LEUpdate)); + TII->get(Opc)); MIB.addDef(ARM::LR); MIB.add(End->getOperand(0)); MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - End->eraseFromParent(); - Dec->eraseFromParent(); + LoLoop.End->eraseFromParent(); + LoLoop.Dec->eraseFromParent(); return &*MIB; }; @@ -496,18 +996,22 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, } }; - if (Revert) { - if (Start->getOpcode() == ARM::t2WhileLoopStart) - RevertWhile(Start); + if (LoLoop.Revert) { + if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(LoLoop.Start); else - Start->eraseFromParent(); - bool FlagsAlreadySet = RevertLoopDec(Dec, true); - RevertLoopEnd(End, FlagsAlreadySet); + LoLoop.Start->eraseFromParent(); + bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true); + RevertLoopEnd(LoLoop.End, FlagsAlreadySet); } else { - Start = ExpandLoopStart(ML, Start, InsertPt); - RemoveDeadBranch(Start); - End = ExpandLoopEnd(ML, Dec, End); - RemoveDeadBranch(End); + LoLoop.Start = ExpandLoopStart(LoLoop); + RemoveDeadBranch(LoLoop.Start); + LoLoop.End = ExpandLoopEnd(LoLoop); + RemoveDeadBranch(LoLoop.End); + if (LoLoop.IsTailPredicationLegal()) { + RemoveLoopUpdate(LoLoop); + ConvertVPTBlocks(LoLoop); + } } } @@ -521,7 +1025,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() { SmallVector<MachineInstr*, 4> Ends; for (auto &I : MBB) { - if (IsLoopStart(I)) + if (isLoopStart(I)) Starts.push_back(&I); else if (I.getOpcode() == ARM::t2LoopDec) Decs.push_back(&I); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp index ae5657a0a2c1..e2c9335db419 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -14,23 +14,24 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/Statistic.h" +#include "ARM.h" +#include "ARMSubtarget.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/OrderedBasicBlock.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/NoFolder.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Debug.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "ARM.h" -#include "ARMSubtarget.h" +#include "llvm/Transforms/Scalar.h" 
+#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; using namespace PatternMatch; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td index b008d3e2e296..dea1d767beb4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td @@ -182,11 +182,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">; // But only select them if more precision in FP computation is allowed, and when // they are not slower than a mul + add sequence. // Do not use them for Darwin platforms. -def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast && " - " Subtarget->hasVFP4Base()) && " - "!Subtarget->isTargetDarwin() &&" - "Subtarget->useFPVMLx()">; +def UseFusedMAC : Predicate<"TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast && " + "Subtarget->useFPVFMx()">; def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index b100150175fc..43c8cd5a89be 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -172,8 +172,9 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) #endif } -const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass( - const TargetRegisterClass &RC) const { +const RegisterBank & +ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const { using namespace ARM; switch (RC.getID()) { @@ -249,7 +250,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_SEXT: case G_ZEXT: case G_ANYEXT: - case G_GEP: + case G_PTR_ADD: case G_INTTOPTR: case G_PTRTOINT: case G_CTLZ: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h index 1961f7af49bb..b8aff65a967e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h @@ -32,8 +32,8 @@ class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo { public: ARMRegisterBankInfo(const TargetRegisterInfo &TRI); - const RegisterBank & - getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const override; const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp index 09603057b2c8..eb4d39b01cbb 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -72,6 +72,9 @@ static cl::opt<bool> ForceFastISel("arm-force-fast-isel", cl::init(false), cl::Hidden); +static cl::opt<bool> EnableSubRegLiveness("arm-enable-subreg-liveness", + cl::init(false), cl::Hidden); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. 
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -379,11 +382,23 @@ bool ARMSubtarget::enableMachineScheduler() const { return useMachineScheduler(); } +bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; } + // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { + if (enableMachineScheduler()) + return false; + if (disablePostRAScheduler()) + return false; + // Thumb1 cores will generally not benefit from post-ra scheduling + return !isThumb1Only(); +} + +bool ARMSubtarget::enablePostRAMachineScheduler() const { + if (!enableMachineScheduler()) + return false; if (disablePostRAScheduler()) return false; - // Don't reschedule potential IT blocks. return !isThumb1Only(); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h index ef460342a69e..6bdd021970ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h @@ -203,6 +203,10 @@ protected: /// whether the FP VML[AS] instructions are slow (if so, don't use them). bool SlowFPVMLx = false; + /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates + /// whether the FP VFM[AS] instructions are slow (if so, don't use them). + bool SlowFPVFMx = false; + /// HasVMLxForwarding - If true, NEON has special multiplier accumulator /// forwarding to allow mul + mla being issued back to back. bool HasVMLxForwarding = false; @@ -223,9 +227,6 @@ protected: /// register allocation. bool DisablePostRAScheduler = false; - /// UseAA - True if using AA during codegen (DAGCombine, MISched, etc) - bool UseAA = false; - /// HasThumb2 - True if Thumb2 instructions are supported. bool HasThumb2 = false; @@ -635,6 +636,11 @@ public: bool useMulOps() const { return UseMulOps; } bool useFPVMLx() const { return !SlowFPVMLx; } + bool useFPVFMx() const { + return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx; + } + bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); } + bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); } bool hasVMLxForwarding() const { return HasVMLxForwarding; } bool isFPBrccSlow() const { return SlowFPBrcc; } bool hasFP64() const { return HasFP64; } @@ -806,9 +812,15 @@ public: /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; + /// True for some subtargets at > -O0. + bool enablePostRAMachineScheduler() const override; + + /// Check whether this subtarget wants to use subregister liveness. + bool enableSubRegLiveness() const override; + /// Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). - bool useAA() const override { return UseAA; } + bool useAA() const override { return true; } // enableAtomicExpand- True if we need to expand our atomics. 
bool enableAtomicExpand() const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 5c8007f101d9..84876eda33a6 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/CFGuard.h" #include "llvm/Transforms/Scalar.h" #include <cassert> #include <memory> @@ -78,7 +79,7 @@ namespace llvm { void initializeARMExecutionDomainFixPass(PassRegistry&); } -extern "C" void LLVMInitializeARMTarget() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget()); RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget()); @@ -90,7 +91,6 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); - initializeARMCodeGenPreparePass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -98,6 +98,7 @@ extern "C" void LLVMInitializeARMTarget() { initializeMVEVPTBlockPass(Registry); initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); + initializeMVEGatherScatterLoweringPass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -321,14 +322,7 @@ namespace { class ARMPassConfig : public TargetPassConfig { public: ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) { - if (TM.getOptLevel() != CodeGenOpt::None) { - ARMGenSubtargetInfo STI(TM.getTargetTriple(), TM.getTargetCPU(), - TM.getTargetFeatureString()); - if (STI.hasFeature(ARM::FeatureUseMISched)) - substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); - } - } + : TargetPassConfig(TM, PM) {} ARMBaseTargetMachine &getARMTargetMachine() const { return getTM<ARMBaseTargetMachine>(); @@ -411,6 +405,8 @@ void ARMPassConfig::addIRPasses() { return ST.hasAnyDataBarrier() && !ST.isThumb1Only(); })); + addPass(createMVEGatherScatterLoweringPass()); + TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. @@ -420,11 +416,15 @@ void ARMPassConfig::addIRPasses() { // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); + + // Add Control Flow Guard checks. + if (TM->getTargetTriple().isOSWindows()) + addPass(createCFGuardCheckPass()); } void ARMPassConfig::addCodeGenPrepare() { if (getOptLevel() != CodeGenOpt::None) - addPass(createARMCodeGenPreparePass()); + addPass(createTypePromotionPass()); TargetPassConfig::addCodeGenPrepare(); } @@ -518,6 +518,13 @@ void ARMPassConfig::addPreSched2() { } addPass(createMVEVPTBlockPass()); addPass(createThumb2ITBlockPass()); + + // Add both scheduling passes to give the subtarget an opportunity to pick + // between them. 
+ if (getOptLevel() != CodeGenOpt::None) { + addPass(&PostMachineSchedulerID); + addPass(&PostRASchedulerID); + } } void ARMPassConfig::addPreEmitPass() { @@ -534,4 +541,8 @@ void ARMPassConfig::addPreEmitPass() { addPass(createARMConstantIslandPass()); addPass(createARMLowOverheadLoopsPass()); + + // Identify valid longjmp targets for Windows Control Flow Guard. + if (TM->getTargetTriple().isOSWindows()) + addPass(createCFGuardLongjmpPass()); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h index cb8650d8139b..ac55d2bdcc2b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -70,6 +70,8 @@ public: TargetTriple.isOSWindows() || TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; } + + bool targetSchedulesPostRAScheduling() const override { return true; }; }; /// ARM/Thumb little endian target machine. diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 86c8684d14dc..7ff05034c1f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" @@ -37,13 +38,17 @@ using namespace llvm; #define DEBUG_TYPE "armtti" static cl::opt<bool> EnableMaskedLoadStores( - "enable-arm-maskedldst", cl::Hidden, cl::init(false), + "enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores")); static cl::opt<bool> DisableLowOverheadLoops( "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); +extern cl::opt<bool> DisableTailPredication; + +extern cl::opt<bool> EnableMaskedGatherScatters; + bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -104,7 +109,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, return 1; } -int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, +int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty) { // Division by a constant can be turned into multiplication, but only if we // know it's constant. So it's not so much that the immediate is cheap (it's @@ -512,6 +517,27 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { (EltWidth == 8); } +bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) { + if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) + return false; + + // This method is called in 2 places: + // - from the vectorizer with a scalar type, in which case we need to get + // this as good as we can with the limited info we have (and rely on the cost + // model for the rest). + // - from the masked intrinsic lowering pass with the actual vector type. + // For MVE, we have a custom lowering pass that will already have custom + // legalised any gathers that we can to MVE intrinsics, and want to expand all + // the rest. The pass runs before the masked intrinsic lowering pass, so if we + // are here, we know we want to expand. 
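Stripped of the call-site differences, the legality test that follows amounts to the three non-extending MVE gather shapes with their natural alignment. A standalone restatement of that rule, for illustration only (this is not the TTI hook itself):

#include <cstdint>

// True if an MVE gather can load elements of EltBits bits each, given the
// access alignment in bytes (0 here stands for "no alignment known").
static bool mveGatherElementIsLegal(unsigned EltBits, uint64_t AlignBytes) {
  switch (EltBits) {
  case 32: return AlignBytes == 0 || AlignBytes >= 4;   // v4i32
  case 16: return AlignBytes == 0 || AlignBytes >= 2;   // v8i16
  case 8:  return true;                                 // v16i8
  default: return false;
  }
}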
+ if (isa<VectorType>(Ty)) + return false; + + unsigned EltWidth = Ty->getScalarSizeInBits(); + return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) || + (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8); +} + int ARMTTIImpl::getMemcpyCost(const Instruction *I) { const MemCpyInst *MI = dyn_cast<MemCpyInst>(I); assert(MI && "MemcpyInst expected"); @@ -640,58 +666,60 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -int ARMTTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, - ArrayRef<const Value *> Args) { +int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args, + const Instruction *CxtI) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - const unsigned FunctionCallDivCost = 20; - const unsigned ReciprocalDivCost = 10; - static const CostTblEntry CostTbl[] = { - // Division. - // These costs are somewhat random. Choose a cost of 20 to indicate that - // vectorizing devision (added function call) is going to be very expensive. - // Double registers types. - { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, - { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, - { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, - { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, - { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, - { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, - { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, - { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, - { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, - { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, - { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, - // Quad register types. - { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, - { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, - { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, - { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, - { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, - { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, - { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, - { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, - { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, - { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, - { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, - // Multiplication. - }; - if (ST->hasNEON()) { + const unsigned FunctionCallDivCost = 20; + const unsigned ReciprocalDivCost = 10; + static const CostTblEntry CostTbl[] = { + // Division. + // These costs are somewhat random. 
Choose a cost of 20 to indicate that + // vectorizing devision (added function call) is going to be very expensive. + // Double registers types. + { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, + // Quad register types. + { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, + // Multiplication. + }; + if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) return LT.first * Entry->Cost; @@ -712,6 +740,33 @@ int ARMTTIImpl::getArithmeticInstrCost( return Cost; } + // If this operation is a shift on arm/thumb2, it might well be folded into + // the following instruction, hence having a cost of 0. + auto LooksLikeAFreeShift = [&]() { + if (ST->isThumb1Only() || Ty->isVectorTy()) + return false; + + if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift()) + return false; + if (Op2Info != TargetTransformInfo::OK_UniformConstantValue) + return false; + + // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB + switch (cast<Instruction>(CxtI->user_back())->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Xor: + case Instruction::Or: + case Instruction::ICmp: + return true; + default: + return false; + } + }; + if (LooksLikeAFreeShift()) + return 0; + int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy() ? 
ST->getMVEVectorCostFactor() : 1; @@ -735,11 +790,13 @@ int ARMTTIImpl::getArithmeticInstrCost( return BaseCost; } -int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace, const Instruction *I) { +int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + MaybeAlign Alignment, unsigned AddressSpace, + const Instruction *I) { std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 && + if (ST->hasNEON() && Src->isVectorTy() && + (Alignment && *Alignment != Align(16)) && Src->getVectorElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. @@ -751,13 +808,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, return BaseCost * LT.first; } -int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int ARMTTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, + bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); @@ -772,9 +826,19 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, // vldN/vstN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be // matched to more than one vldN/vstN instruction. + int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1; if (NumElts % Factor == 0 && - TLI->isLegalInterleavedAccessType(SubVecTy, DL)) - return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); + TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL)) + return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL); + + // Some smaller than legal interleaved patterns are cheap as we can make + // use of the vmovn or vrev patterns to interleave a standard load. This is + // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is + // promoted differently). The cost of 2 here is then a load and vrev or + // vmovn. + if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && + VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64) + return 2 * BaseCost; } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, @@ -998,6 +1062,142 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, return true; } +static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { + // We don't allow icmp's, and because we only look at single block loops, + // we simply count the icmps, i.e. there should only be 1 for the backedge. + if (isa<ICmpInst>(&I) && ++ICmpCount > 1) + return false; + + if (isa<FCmpInst>(&I)) + return false; + + // We could allow extending/narrowing FP loads/stores, but codegen is + // too inefficient so reject this for now. 
+ if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I)) + return false; + + // Extends have to be extending-loads + if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) ) + if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0))) + return false; + + // Truncs have to be narrowing-stores + if (isa<TruncInst>(&I) ) + if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin())) + return false; + + return true; +} + +// To set up a tail-predicated loop, we need to know the total number of +// elements processed by that loop. Thus, we need to determine the element +// size and: +// 1) it should be uniform for all operations in the vector loop, so we +// e.g. don't want any widening/narrowing operations. +// 2) it should be smaller than i64s because we don't have vector operations +// that work on i64s. +// 3) we don't want elements to be reversed or shuffled, to make sure the +// tail-predication masks/predicates the right lanes. +// +static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + const DataLayout &DL, + const LoopAccessInfo *LAI) { + PredicatedScalarEvolution PSE = LAI->getPSE(); + int ICmpCount = 0; + int Stride = 0; + + LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n"); + SmallVector<Instruction *, 16> LoadStores; + for (BasicBlock *BB : L->blocks()) { + for (Instruction &I : BB->instructionsWithoutDebug()) { + if (isa<PHINode>(&I)) + continue; + if (!canTailPredicateInstruction(I, ICmpCount)) { + LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); + return false; + } + + Type *T = I.getType(); + if (T->isPointerTy()) + T = T->getPointerElementType(); + + if (T->getScalarSizeInBits() > 32) { + LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); + return false; + } + + if (isa<StoreInst>(I) || isa<LoadInst>(I)) { + Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); + int64_t NextStride = getPtrStride(PSE, Ptr, L); + // TODO: for now only allow consecutive strides of 1. We could support + // other strides as long as it is uniform, but let's keep it simple for + // now. + if (Stride == 0 && NextStride == 1) { + Stride = NextStride; + continue; + } + if (Stride != NextStride) { + LLVM_DEBUG(dbgs() << "Different strides found, can't " + "tail-predicate\n."); + return false; + } + } + } + } + + LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); + return true; +} + +bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, + ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *TLI, + DominatorTree *DT, + const LoopAccessInfo *LAI) { + if (DisableTailPredication) + return false; + + // Creating a predicated vector loop is the first step for generating a + // tail-predicated hardware loop, for which we need the MVE masked + // load/stores instructions: + if (!ST->hasMVEIntegerOps()) + return false; + + // For now, restrict this to single block loops. 
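Two hypothetical loops make the rules above concrete (they are written for this note, not taken from the patch): the first has a uniform 32-bit element size and consecutive unit-stride accesses, so it is a tail-predication candidate; the second accumulates into a 64-bit value, which rule 2) above (nothing wider than 32-bit elements) rejects.

#include <cstdint>

// Accepted shape: single block, uniform i32 elements, stride-1 load/store.
void scale_i32(const int32_t *a, int32_t *b, int n) {
  for (int i = 0; i < n; ++i)
    b[i] = a[i] * 3;
}

// Rejected shape: the 64-bit accumulator forces vector elements wider than
// 32 bits, so canTailPredicateLoop gives up on this loop.
int64_t sum_wide(const int32_t *a, int n) {
  int64_t s = 0;
  for (int i = 0; i < n; ++i)
    s += a[i];
  return s;
}

Both loops are single-block, so the structural checks that follow would not reject them for their CFG shape; only the element-size rule separates them. Whether the vectorizer actually vectorizes either one still depends on the rest of the cost model.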
+ if (L->getNumBlocks() > 1) { + LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " + "loop.\n"); + return false; + } + + assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected"); + + HardwareLoopInfo HWLoopInfo(L); + if (!HWLoopInfo.canAnalyze(*LI)) { + LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + "analyzable.\n"); + return false; + } + + // This checks if we have the low-overhead branch architecture + // extension, and if we will create a hardware-loop: + if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { + LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + "profitable.\n"); + return false; + } + + if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { + LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + "a candidate.\n"); + return false; + } + + return canTailPredicateLoop(L, LI, SE, DL, LAI); +} + + void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Only currently enable these preferences for M-Class cores. @@ -1035,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Cost = 0; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + if (isa<CallInst>(I) || isa<InvokeInst>(I)) { ImmutableCallSite CS(&I); if (const Function *F = CS.getCalledFunction()) { @@ -1043,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } - // Don't unroll vectorised loop. MVE does not benefit from it as much as - // scalar code. - if (I.getType()->isVectorTy()) - return; SmallVector<const Value*, 4> Operands(I.value_op_begin(), I.value_op_end()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index a878fdcfe3c7..880588adfdfd 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -69,15 +69,15 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx, ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs, ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign, - ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding, - ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR, - ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp, - ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor, - ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, - ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass, - ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, - ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, - ARM::FeatureNoNegativeImmediates + ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx, + ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb, + ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR, + ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, + ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, + ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, + ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, + ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, + ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates }; const 
ARMSubtarget *getST() const { return ST; } @@ -115,7 +115,7 @@ public: using BaseT::getIntImmCost; int getIntImmCost(const APInt &Imm, Type *Ty); - int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); /// @} @@ -159,6 +159,10 @@ public: return isLegalMaskedLoad(DataTy, Alignment); } + bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment); + + bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; } + int getMemcpyCost(const Instruction *I); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); @@ -187,9 +191,10 @@ public: TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef<const Value *> Args = ArrayRef<const Value *>()); + ArrayRef<const Value *> Args = ArrayRef<const Value *>(), + const Instruction *CxtI = nullptr); - int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, @@ -203,7 +208,12 @@ public: AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo); - + bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, + ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *TLI, + DominatorTree *DT, + const LoopAccessInfo *LAI); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index d2c355c1da75..f6d76ee09534 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6554,7 +6554,8 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // Check against T3. If the second register is the PC, this is an // alternate form of ADR, which uses encoding T4, so check for that too. 
if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC && - static_cast<ARMOperand &>(*Operands[5]).isT2SOImm()) + (static_cast<ARMOperand &>(*Operands[5]).isT2SOImm() || + static_cast<ARMOperand &>(*Operands[5]).isT2SOImmNeg())) return false; // Otherwise, we use encoding T4, which does not have a cc_out @@ -6609,9 +6610,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && (static_cast<ARMOperand &>(*Operands[4]).isImm() || (Operands.size() == 6 && - static_cast<ARMOperand &>(*Operands[5]).isImm()))) - return true; - + static_cast<ARMOperand &>(*Operands[5]).isImm()))) { + // Thumb2 (add|sub){s}{p}.w GPRnopc, sp, #{T2SOImm} has cc_out + return (!(isThumbTwo() && + (static_cast<ARMOperand &>(*Operands[4]).isT2SOImm() || + static_cast<ARMOperand &>(*Operands[4]).isT2SOImmNeg()))); + } + // Fixme: Should join all the thumb+thumb2 (add|sub) in a single if case + // Thumb2 ADD r0, #4095 -> ADDW r0, r0, #4095 (T4) + // Thumb2 SUB r0, #4095 -> SUBW r0, r0, #4095 + if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") && + (Operands.size() == 5) && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::SP && + static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::PC && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + static_cast<ARMOperand &>(*Operands[4]).isImm()) { + const ARMOperand &IMM = static_cast<ARMOperand &>(*Operands[4]); + if (IMM.isT2SOImm() || IMM.isT2SOImmNeg()) + return false; // add.w / sub.w + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IMM.getImm())) { + const int64_t Value = CE->getValue(); + // Thumb1 imm8 sub / add + if ((Value < ((1 << 7) - 1) << 2) && inITBlock() && (!(Value & 3)) && + isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg())) + return false; + return true; // Thumb2 T4 addw / subw + } + } return false; } @@ -6703,7 +6729,7 @@ static void applyMnemonicAliases(StringRef &Mnemonic, // omitted. We don't have a way to do that in tablegen, so fix it up here. // // We have to be careful to not emit an invalid Rt2 here, because the rest of -// the assmebly parser could then generate confusing diagnostics refering to +// the assembly parser could then generate confusing diagnostics refering to // it. If we do find anything that prevents us from doing the transformation we // bail out, and let the assembly parser report an error on the instruction as // it is written. 
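The encoding choices being juggled here (T1 narrow add/sub, T3 add.w/sub.w with a Thumb-2 modified immediate, T4 addw/subw with a plain imm12) hinge on which immediates each form can represent. A standalone sketch of the modified-immediate test behind isT2SOImm(), written from the ThumbExpandImm description and intended purely as an illustration rather than the assembler's own predicate:

#include <cstdint>

// True if V is representable as a Thumb-2 modified immediate: an 8-bit
// value, that value replicated across halfwords or all bytes, or an 8-bit
// value with its top bit set rotated right by 8..31.
static bool isThumb2ModifiedImm(uint32_t V) {
  uint32_t B0 = V & 0xffu;
  uint32_t B1 = (V >> 8) & 0xffu;
  if (V <= 0xffu)
    return true;                                        // 0x000000ab
  if (B0 && V == (B0 | (B0 << 16)))
    return true;                                        // 0x00ab00ab
  if (B1 && V == ((B1 << 8) | (B1 << 24)))
    return true;                                        // 0xab00ab00
  if (B0 && V == B0 * 0x01010101u)
    return true;                                        // 0xabababab
  for (unsigned Rot = 8; Rot < 32; ++Rot) {
    uint32_t Imm8 = (V << Rot) | (V >> (32 - Rot));     // undo the rotation
    if (Imm8 <= 0xffu && (Imm8 & 0x80u))
      return true;
  }
  return false;
}

// The plain 12-bit immediate accepted by the T4 addw/subw forms.
static bool isThumb2Imm12(uint32_t V) { return V <= 0xfffu; }

A value that passes the first test can use the shorter, flag-setting-capable T3 encoding; something like 4095 (0xfff) fails it and has to fall through to addw/subw, which is exactly the preference the surrounding parser changes implement.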
@@ -7707,12 +7733,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, } break; - case ARM::t2ADDri: - case ARM::t2ADDri12: case ARM::t2ADDrr: case ARM::t2ADDrs: - case ARM::t2SUBri: - case ARM::t2SUBri12: case ARM::t2SUBrr: case ARM::t2SUBrs: if (Inst.getOperand(0).getReg() == ARM::SP && @@ -7895,10 +7917,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, case ARM::MVE_VQDMULLs32bh: case ARM::MVE_VQDMULLs32th: case ARM::MVE_VCMULf32: - case ARM::MVE_VMULLs32bh: - case ARM::MVE_VMULLs32th: - case ARM::MVE_VMULLu32bh: - case ARM::MVE_VMULLu32th: { + case ARM::MVE_VMULLBs32: + case ARM::MVE_VMULLTs32: + case ARM::MVE_VMULLBu32: + case ARM::MVE_VMULLTu32: { if (Operands[3]->getReg() == Operands[4]->getReg()) { return Error (Operands[3]->getStartLoc(), "Qd register and Qn register can't be identical"); @@ -9750,23 +9772,33 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } break; case ARM::t2ADDri12: - // If the immediate fits for encoding T3 (t2ADDri) and the generic "add" - // mnemonic was used (not "addw"), encoding T3 is preferred. - if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" || - ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) - break; - Inst.setOpcode(ARM::t2ADDri); - Inst.addOperand(MCOperand::createReg(0)); // cc_out - break; case ARM::t2SUBri12: - // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub" - // mnemonic was used (not "subw"), encoding T3 is preferred. - if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" || + case ARM::t2ADDspImm12: + case ARM::t2SUBspImm12: { + // If the immediate fits for encoding T3 and the generic + // mnemonic was used, encoding T3 is preferred. + const StringRef Token = static_cast<ARMOperand &>(*Operands[0]).getToken(); + if ((Token != "add" && Token != "sub") || ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) break; - Inst.setOpcode(ARM::t2SUBri); + switch (Inst.getOpcode()) { + case ARM::t2ADDri12: + Inst.setOpcode(ARM::t2ADDri); + break; + case ARM::t2SUBri12: + Inst.setOpcode(ARM::t2SUBri); + break; + case ARM::t2ADDspImm12: + Inst.setOpcode(ARM::t2ADDspImm); + break; + case ARM::t2SUBspImm12: + Inst.setOpcode(ARM::t2SUBspImm); + break; + } + Inst.addOperand(MCOperand::createReg(0)); // cc_out - break; + return true; + } case ARM::tADDi8: // If the immediate is in the range 0-7, we want tADDi3 iff Rd was // explicitly specified. From the ARM ARM: "Encoding T1 is preferred @@ -9812,6 +9844,25 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + case ARM::t2ADDspImm: + case ARM::t2SUBspImm: { + // Prefer T1 encoding if possible + if (Inst.getOperand(5).getReg() != 0 || HasWideQualifier) + break; + unsigned V = Inst.getOperand(2).getImm(); + if (V & 3 || V > ((1 << 7) - 1) << 2) + break; + MCInst TmpInst; + TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDspImm ? ARM::tADDspi + : ARM::tSUBspi); + TmpInst.addOperand(MCOperand::createReg(ARM::SP)); // destination reg + TmpInst.addOperand(MCOperand::createReg(ARM::SP)); // source reg + TmpInst.addOperand(MCOperand::createImm(V / 4)); // immediate + TmpInst.addOperand(Inst.getOperand(3)); // pred + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } case ARM::t2ADDrr: { // If the destination and first source operand are the same, and // there's no setting of the flags, use encoding T2 instead of T3. @@ -11495,7 +11546,7 @@ bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) { } /// Force static initialization. 
-extern "C" void LLVMInitializeARMAsmParser() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() { RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget()); RegisterMCAsmParser<ARMAsmParser> Y(getTheARMBETarget()); RegisterMCAsmParser<ARMAsmParser> A(getTheThumbLETarget()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index eabc26d05f47..d26b04556abb 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -137,18 +137,15 @@ public: DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &VStream, raw_ostream &CStream) const override; private: DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &VStream, raw_ostream &CStream) const; DecodeStatus getThumbInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &VStream, raw_ostream &CStream) const; mutable ITStatus ITBlock; @@ -204,6 +201,9 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -566,6 +566,9 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + #include "ARMGenDisassemblerTables.inc" static MCDisassembler *createARMDisassembler(const Target &T, @@ -576,8 +579,7 @@ static MCDisassembler *createARMDisassembler(const Target &T, // Post-decoding checks static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, - uint64_t Address, raw_ostream &OS, - raw_ostream &CS, + uint64_t Address, raw_ostream &CS, uint32_t Insn, DecodeStatus Result) { switch (MI.getOpcode()) { @@ -609,17 +611,16 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, - uint64_t Address, raw_ostream &OS, + uint64_t Address, raw_ostream &CS) const { if (STI.getFeatureBits()[ARM::ModeThumb]) - return getThumbInstruction(MI, Size, Bytes, Address, OS, CS); - return getARMInstruction(MI, Size, Bytes, Address, OS, CS); + return getThumbInstruction(MI, Size, Bytes, Address, CS); + return getARMInstruction(MI, Size, Bytes, Address, CS); } DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &OS, raw_ostream &CS) const { CommentStream = &CS; @@ -642,7 +643,7 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result); + 
return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); } struct DecodeTable { @@ -673,7 +674,7 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result); + return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); } Size = 4; @@ -906,7 +907,6 @@ void ARMDisassembler::UpdateThumbVFPPredicate( DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &OS, raw_ostream &CS) const { CommentStream = &CS; @@ -1010,7 +1010,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, if (Result != MCDisassembler::Fail) { Size = 4; Check(Result, AddThumbPredicate(MI)); - return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn32, Result); + return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result); } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { @@ -1099,7 +1099,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } -extern "C" void LLVMInitializeARMDisassembler() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMDisassembler() { TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(), createARMDisassembler); TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(), @@ -1231,6 +1231,17 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, return S; } +static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo != 13) + return MCDisassembler::Fail; + + unsigned Register = GPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { unsigned Register = 0; @@ -5588,14 +5599,25 @@ static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, unsigned sign1 = fieldFromInstruction(Insn, 21, 1); unsigned sign2 = fieldFromInstruction(Insn, 23, 1); if (sign1 != sign2) return MCDisassembler::Fail; + const unsigned Rd = fieldFromInstruction(Insn, 8, 4); + assert(Inst.getNumOperands() == 0 && "We should receive an empty Inst"); + DecodeStatus S = DecoderGPRRegisterClass(Inst, Rd, Address, Decoder); unsigned Val = fieldFromInstruction(Insn, 0, 8); Val |= fieldFromInstruction(Insn, 12, 3) << 8; Val |= fieldFromInstruction(Insn, 26, 1) << 11; - Val |= sign1 << 12; - Inst.addOperand(MCOperand::createImm(SignExtend32<13>(Val))); - - return MCDisassembler::Success; + // If sign, then it is decreasing the address. 
+ if (sign1) { + // Following ARMv7 Architecture Manual, when the offset + // is zero, it is decoded as a subw, not as a adr.w + if (!Val) { + Inst.setOpcode(ARM::t2SUBri12); + Inst.addOperand(MCOperand::createReg(ARM::PC)); + } else + Val = -Val; + } + Inst.addOperand(MCOperand::createImm(Val)); + return S; } static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val, @@ -6595,3 +6617,40 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address Inst.addOperand(MCOperand::createReg(ARM::VPR)); return S; } + +static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + const unsigned Rd = fieldFromInstruction(Insn, 8, 4); + const unsigned Rn = fieldFromInstruction(Insn, 16, 4); + const unsigned Imm12 = fieldFromInstruction(Insn, 26, 1) << 11 | + fieldFromInstruction(Insn, 12, 3) << 8 | + fieldFromInstruction(Insn, 0, 8); + const unsigned TypeT3 = fieldFromInstruction(Insn, 25, 1); + unsigned sign1 = fieldFromInstruction(Insn, 21, 1); + unsigned sign2 = fieldFromInstruction(Insn, 23, 1); + unsigned S = fieldFromInstruction(Insn, 20, 1); + if (sign1 != sign2) + return MCDisassembler::Fail; + + // T3 does a zext of imm12, where T2 does a ThumbExpandImm (T2SOImm) + DecodeStatus DS = MCDisassembler::Success; + if ((!Check(DS, + DecodeGPRspRegisterClass(Inst, Rd, Address, Decoder))) || // dst + (!Check(DS, DecodeGPRspRegisterClass(Inst, Rn, Address, Decoder)))) + return MCDisassembler::Fail; + if (TypeT3) { + Inst.setOpcode(sign1 ? ARM::t2SUBspImm12 : ARM::t2ADDspImm12); + S = 0; + Inst.addOperand(MCOperand::createImm(Imm12)); // zext imm12 + } else { + Inst.setOpcode(sign1 ? ARM::t2SUBspImm : ARM::t2ADDspImm); + if (!Check(DS, DecodeT2SOImm(Inst, Imm12, Address, Decoder))) // imm12 + return MCDisassembler::Fail; + } + if (!Check(DS, DecodeCCOutOperand(Inst, S, Address, Decoder))) // cc_out + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(0)); // pred + + return DS; +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 1fee38821a49..2c26dd388c05 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -255,8 +255,11 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, // execute-only section in the object. 
MCSectionELF *TextSection = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); - if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions() && - !TextSection->hasData()) { + if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) { + for (auto &F : TextSection->getFragmentList()) + if (auto *DF = dyn_cast<MCDataFragment>(&F)) + if (!DF->getContents().empty()) + return; TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE); } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index f51fbdcd84da..f558ca8d2d9f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -441,10 +441,12 @@ public: friend class ARMTargetELFStreamer; ARMELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, - bool IsThumb) - : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)), - IsThumb(IsThumb) { + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter, bool IsThumb, + bool IsAndroid) + : MCELFStreamer(Context, std::move(TAB), std::move(OW), + std::move(Emitter)), + IsThumb(IsThumb), IsAndroid(IsAndroid) { EHReset(); } @@ -657,11 +659,10 @@ private: uint64_t Offset) { auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( Name + "." + Twine(MappingSymbolCounter++))); - EmitLabel(Symbol, Loc, F); + EmitLabelAtPos(Symbol, Loc, F, Offset); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); Symbol->setExternal(false); - Symbol->setOffset(Offset); } void EmitThumbFunc(MCSymbol *Func) override { @@ -687,6 +688,7 @@ private: void EmitFixup(const MCExpr *Expr, MCFixupKind Kind); bool IsThumb; + bool IsAndroid; int64_t MappingSymbolCounter = 0; DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>> @@ -1269,7 +1271,12 @@ void ARMELFStreamer::emitFnEnd() { // Emit the exception index table entry SwitchToExIdxSection(*FnStart); - if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX) + // The EHABI requires a dependency preserving R_ARM_NONE relocation to the + // personality routine to protect it from an arbitrary platform's static + // linker garbage collection. We disable this for Android where the unwinder + // is either dynamically linked or directly references the personality + // routine. + if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX && !IsAndroid) EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex)); const MCSymbolRefExpr *FnStartRef = @@ -1504,9 +1511,11 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, - bool RelaxAll, bool IsThumb) { - ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), std::move(OW), - std::move(Emitter), IsThumb); + bool RelaxAll, bool IsThumb, + bool IsAndroid) { + ARMELFStreamer *S = + new ARMELFStreamer(Context, std::move(TAB), std::move(OW), + std::move(Emitter), IsThumb, IsAndroid); // FIXME: This should eventually end up somewhere else where more // intelligent flag decisions can be made. For now we are just maintaining // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default. 
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index a1def61b58d9..b36106a78b71 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -88,8 +88,9 @@ void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << markup("<reg:") << getRegisterName(RegNo, DefaultAltIdx) << markup(">"); } -void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { +void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, const MCSubtargetInfo &STI, + raw_ostream &O) { unsigned Opcode = MI->getOpcode(); switch (Opcode) { @@ -275,7 +276,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, // Copy the rest operands into NewMI. for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i) NewMI.addOperand(MI->getOperand(i)); - printInstruction(&NewMI, STI, O); + printInstruction(&NewMI, Address, STI, O); return; } break; @@ -288,7 +289,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, switch (MI->getOperand(0).getImm()) { default: if (!printAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); + printInstruction(MI, Address, STI, O); break; case 0: O << "\tssbb"; @@ -302,7 +303,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, } if (!printAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); + printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index eeb811e216fc..20f901033395 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -25,13 +25,13 @@ public: bool applyTargetSpecificCLOption(StringRef Opt) override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &O) override; void printRegName(raw_ostream &OS, unsigned RegNo) const override; // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); + void printInstruction(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 90022a8d88a6..9f60e70e0e02 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -187,7 +187,8 @@ static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { } static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, - const Triple &TheTriple) { + const Triple &TheTriple, + const MCTargetOptions &Options) { MCAsmInfo *MAI; if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO()) MAI = new ARMMCAsmInfoDarwin(TheTriple); @@ -211,7 +212,8 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, bool RelaxAll) { return createARMELFStreamer( Ctx, std::move(MAB), std::move(OW), std::move(Emitter), false, - (T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb)); + (T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb), + T.isAndroid()); } static MCStreamer * @@ -315,7 +317,7 @@ static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) { } // Force static initialization. -extern "C" void LLVMInitializeARMTargetMC() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() { for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(), &getTheThumbLETarget(), &getTheThumbBETarget()}) { // Register the MC asm info. diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index 38667d686b85..a9460b70da56 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the unwind opcode assmebler for ARM exception handling +// This file implements the unwind opcode assembler for ARM exception handling // table. // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h index c3134c04b33a..5fb7307159d1 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file declares the unwind opcode assmebler for ARM exception handling +// This file declares the unwind opcode assembler for ARM exception handling // table. 
// //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp new file mode 100644 index 000000000000..9f64af02e698 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -0,0 +1,301 @@ +//===- MVEGatherScatterLowering.cpp - Gather/Scatter lowering -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// This pass custom lowers llvm.gather and llvm.scatter instructions to +/// arm.mve.gather and arm.mve.scatter intrinsics, optimising the code to +/// produce a better final result as we go. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include <algorithm> +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "mve-gather-scatter-lowering" + +cl::opt<bool> EnableMaskedGatherScatters( + "enable-arm-maskedgatscat", cl::Hidden, cl::init(false), + cl::desc("Enable the generation of masked gathers and scatters")); + +namespace { + +class MVEGatherScatterLowering : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + explicit MVEGatherScatterLowering() : FunctionPass(ID) { + initializeMVEGatherScatterLoweringPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { + return "MVE gather/scatter lowering"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<TargetPassConfig>(); + FunctionPass::getAnalysisUsage(AU); + } + +private: + // Check this is a valid gather with correct alignment + bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, + unsigned Alignment); + // Check whether Ptr is hidden behind a bitcast and look through it + void lookThroughBitcast(Value *&Ptr); + // Check for a getelementptr and deduce base and offsets from it, on success + // returning the base directly and the offsets indirectly using the Offsets + // argument + Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder); + + bool lowerGather(IntrinsicInst *I); + // Create a gather from a base + vector of offsets + Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr, + IRBuilder<> Builder); + // Create a gather from a vector of pointers + Value 
*tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, + IRBuilder<> Builder); +}; + +} // end anonymous namespace + +char MVEGatherScatterLowering::ID = 0; + +INITIALIZE_PASS(MVEGatherScatterLowering, DEBUG_TYPE, + "MVE gather/scattering lowering pass", false, false) + +Pass *llvm::createMVEGatherScatterLoweringPass() { + return new MVEGatherScatterLowering(); +} + +bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, + unsigned ElemSize, + unsigned Alignment) { + // Do only allow non-extending gathers for now + if (((NumElements == 4 && ElemSize == 32) || + (NumElements == 8 && ElemSize == 16) || + (NumElements == 16 && ElemSize == 8)) && + ElemSize / 8 <= Alignment) + return true; + LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid " + << "alignment or vector type \n"); + return false; +} + +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, + IRBuilder<> Builder) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP) { + LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading" + << " from base + vector of offsets\n"); + Value *GEPPtr = GEP->getPointerOperand(); + if (GEPPtr->getType()->isVectorTy()) { + LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers" + << " hidden behind a getelementptr currently not" + << " supported. Expanding.\n"); + return nullptr; + } + if (GEP->getNumOperands() != 2) { + LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many" + << " operands. Expanding.\n"); + return nullptr; + } + Offsets = GEP->getOperand(1); + // SExt offsets inside masked gathers are not permitted by the architecture; + // we therefore can't fold them + if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets)) + Offsets = ZextOffs->getOperand(0); + Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty)); + // If the offset we found does not have the type the intrinsic expects, + // i.e., the same type as the gather itself, we need to convert it (only i + // types) or fall back to expanding the gather + if (OffsType != Offsets->getType()) { + if (OffsType->getScalarSizeInBits() > + Offsets->getType()->getScalarSizeInBits()) { + LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n"); + Offsets = Builder.CreateZExt(Offsets, OffsType, ""); + } else { + LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't" + << " create masked gather\n"); + return nullptr; + } + } + // If none of the checks failed, return the gep's base pointer + return GEPPtr; +} + +void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) { + // Look through bitcast instruction if #elements is the same + if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) { + Type *BCTy = BitCast->getType(); + Type *BCSrcTy = BitCast->getOperand(0)->getType(); + if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) { + LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n"); + Ptr = BitCast->getOperand(0); + } + } +} + +bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n"); + + // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) + // Attempt to turn the masked gather in I into a MVE intrinsic + // Potentially optimising the addressing modes as we do so. 
+ Type *Ty = I->getType(); + Value *Ptr = I->getArgOperand(0); + unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue(); + Value *Mask = I->getArgOperand(2); + Value *PassThru = I->getArgOperand(3); + + if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(), + Ty->getScalarSizeInBits(), Alignment)) + return false; + lookThroughBitcast(Ptr); + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); + + IRBuilder<> Builder(I->getContext()); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder); + if (!Load) + Load = tryCreateMaskedGatherBase(I, Ptr, Builder); + if (!Load) + return false; + + if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) { + LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " + << "creating select\n"); + Load = Builder.CreateSelect(Mask, Load, PassThru); + } + + LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n"); + I->replaceAllUsesWith(Load); + I->eraseFromParent(); + return true; +} + +Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase( + IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n"); + Type *Ty = I->getType(); + if (Ty->getVectorNumElements() != 4) + // Can't build an intrinsic for this + return nullptr; + Value *Mask = I->getArgOperand(2); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, + {Ty, Ptr->getType()}, + {Ptr, Builder.getInt32(0)}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(0), Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( + IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { + using namespace PatternMatch; + Type *Ty = I->getType(); + Value *Offsets; + Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder); + if (!BasePtr) + return nullptr; + + unsigned Scale; + int GEPElemSize = + BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(); + int ResultElemSize = Ty->getScalarSizeInBits(); + // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a + // 8bit, 16bit or 32bit load scaled by 1 + if (GEPElemSize == 32 && ResultElemSize == 32) { + Scale = 2; + } else if (GEPElemSize == 16 && ResultElemSize == 16) { + Scale = 1; + } else if (GEPElemSize == 8) { + Scale = 0; + } else { + LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. 
Can't" + << " create masked gather\n"); + return nullptr; + } + + Value *Mask = I->getArgOperand(2); + if (!match(Mask, m_One())) + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset_predicated, + {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1), Mask}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset, + {Ty, BasePtr->getType(), Offsets->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1)}); +} + +bool MVEGatherScatterLowering::runOnFunction(Function &F) { + if (!EnableMaskedGatherScatters) + return false; + auto &TPC = getAnalysis<TargetPassConfig>(); + auto &TM = TPC.getTM<TargetMachine>(); + auto *ST = &TM.getSubtarget<ARMSubtarget>(F); + if (!ST->hasMVEIntegerOps()) + return false; + SmallVector<IntrinsicInst *, 4> Gathers; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (II && II->getIntrinsicID() == Intrinsic::masked_gather) + Gathers.push_back(II); + } + } + + if (Gathers.empty()) + return false; + + for (IntrinsicInst *I : Gathers) + lowerGather(I); + + return true; +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp index 4db8ab17c49b..038c68739cdf 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -20,7 +20,14 @@ /// - A tail-predicated loop, with implicit predication. /// - A loop containing multiple VCPT instructions, predicating multiple VPT /// blocks of instructions operating on different vector types. +/// +/// This pass inserts the inserts the VCTP intrinsic to represent the effect of +/// tail predication. This will be picked up by the ARM Low-overhead loop pass, +/// which performs the final transformation to a DLSTP or WLSTP tail-predicated +/// loop. +#include "ARM.h" +#include "ARMSubtarget.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -28,20 +35,19 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "ARM.h" -#include "ARMSubtarget.h" using namespace llvm; #define DEBUG_TYPE "mve-tail-predication" #define DESC "Transform predicated vector loops to use MVE tail predication" -static cl::opt<bool> +cl::opt<bool> DisableTailPredication("disable-mve-tail-predication", cl::Hidden, cl::init(true), cl::desc("Disable MVE Tail Predication")); @@ -85,6 +91,12 @@ private: /// Is the icmp that generates an i1 vector, based upon a loop counter /// and a limit that is defined outside the loop. bool isTailPredicate(Instruction *Predicate, Value *NumElements); + + /// Insert the intrinsic to represent the effect of tail predication. 
+ void InsertVCTPIntrinsic(Instruction *Predicate, + DenseMap<Instruction*, Instruction*> &NewPredicates, + VectorType *VecTy, + Value *NumElements); }; } // end namespace @@ -123,7 +135,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m. if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) { - LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n"); return false; } @@ -148,7 +160,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { // Look for the hardware loop intrinsic that sets the iteration count. IntrinsicInst *Setup = FindLoopIterations(Preheader); - // The test.set iteration could live in the pre- preheader. + // The test.set iteration could live in the pre-preheader. if (!Setup) { if (!Preheader->getSinglePredecessor()) return false; @@ -171,11 +183,9 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (!Decrement) return false; - LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L - << *Setup << "\n" + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - bool Changed = TryConvert(Setup->getArgOperand(0)); - return Changed; + return TryConvert(Setup->getArgOperand(0)); } bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { @@ -208,7 +218,7 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { // The vector icmp if (!match(I, m_ICmp(Pred, m_Instruction(Induction), m_Instruction(Shuffle))) || - Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle)) + Pred != ICmpInst::ICMP_ULE) return false; // First find the stuff outside the loop which is setting up the limit @@ -230,11 +240,11 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) return false; - if (TripCount != NumElements) + if (TripCount != NumElements || !L->isLoopInvariant(BECount)) return false; // Now back to searching inside the loop body... - // Find the add with takes the index iv and adds a constant vector to it. + // Find the add with takes the index iv and adds a constant vector to it. Instruction *BroadcastSplat = nullptr; Constant *Const = nullptr; if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), @@ -269,14 +279,14 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); if (!match(OnEntry, m_Zero())) return false; - + Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements(); Instruction *LHS = nullptr; if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) return false; - + return LHS == Phi; } @@ -298,8 +308,8 @@ bool MVETailPredication::IsPredicatedVectorLoop() { unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. // TODO: Can we support vectors larger than 128-bits? 
- unsigned MaxWidth = TTI->getRegisterBitWidth(true); - if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth) + unsigned MaxWidth = TTI->getRegisterBitWidth(true); + if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { @@ -399,19 +409,25 @@ Value* MVETailPredication::ComputeElements(Value *TripCount, // tail predicated loop. static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates, SetVector<Instruction*> &MaybeDead, Loop *L) { - if (BasicBlock *Exit = L->getUniqueExitBlock()) { - for (auto &Pair : NewPredicates) { - Instruction *OldPred = Pair.first; - Instruction *NewPred = Pair.second; - - for (auto &I : *Exit) { - if (I.isSameOperationAs(OldPred)) { - Instruction *PredClone = NewPred->clone(); - PredClone->insertBefore(&I); - I.replaceAllUsesWith(PredClone); - MaybeDead.insert(&I); - break; - } + BasicBlock *Exit = L->getUniqueExitBlock(); + if (!Exit) { + LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n"); + return; + } + + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump(); + dbgs() << "ARM TP: with: "; PredClone->dump()); + break; } } } @@ -432,23 +448,69 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates, Dead.insert(I); } - for (auto *I : Dead) + for (auto *I : Dead) { + LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump()); I->eraseFromParent(); + } for (auto I : L->blocks()) DeleteDeadPHIs(I); } +void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate, + DenseMap<Instruction*, Instruction*> &NewPredicates, + VectorType *VecTy, Value *NumElements) { + IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + Module *M = L->getHeader()->getModule(); + Type *Ty = IntegerType::get(M->getContext(), 32); + + // Insert a phi to count the number of elements processed by the loop. + PHINode *Processed = Builder.CreatePHI(Ty, 2); + Processed->addIncoming(NumElements, L->getLoopPreheader()); + + // Insert the intrinsic to represent the effect of tail predication. + Builder.SetInsertPoint(cast<Instruction>(Predicate)); + ConstantInt *Factor = + ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + + Intrinsic::ID VCTPID; + switch (VecTy->getNumElements()) { + default: + llvm_unreachable("unexpected number of lanes"); + case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; + case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; + case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; + + // FIXME: vctp64 currently not supported because the predicate + // vector wants to be <2 x i1>, but v2i1 is not a legal MVE + // type, so problems happen at isel time. + // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics + // purposes, but takes a v4i1 instead of a v2i1. + } + Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Value *TailPredicate = Builder.CreateCall(VCTP, Processed); + Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast<Instruction>(TailPredicate); + + // Add the incoming value to the new phi. + // TODO: This add likely already exists in the loop. 
+ Value *Remaining = Builder.CreateSub(Processed, Factor); + Processed->addIncoming(Remaining, L->getLoopLatch()); + LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " + << *Processed << "\n" + << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n"); +} + bool MVETailPredication::TryConvert(Value *TripCount) { - if (!IsPredicatedVectorLoop()) + if (!IsPredicatedVectorLoop()) { + LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop"); return false; + } - LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); // Walk through the masked intrinsics and try to find whether the predicate // operand is generated from an induction variable. - Module *M = L->getHeader()->getModule(); - Type *Ty = IntegerType::get(M->getContext(), 32); SetVector<Instruction*> Predicates; DenseMap<Instruction*, Instruction*> NewPredicates; @@ -465,43 +527,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) { continue; if (!isTailPredicate(Predicate, NumElements)) { - LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n"); continue; } - LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n"); Predicates.insert(Predicate); - // Insert a phi to count the number of elements processed by the loop. - IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); - PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(NumElements, L->getLoopPreheader()); - - // Insert the intrinsic to represent the effect of tail predication. - Builder.SetInsertPoint(cast<Instruction>(Predicate)); - ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); - Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { - default: - llvm_unreachable("unexpected number of lanes"); - case 2: VCTPID = Intrinsic::arm_vctp64; break; - case 4: VCTPID = Intrinsic::arm_vctp32; break; - case 8: VCTPID = Intrinsic::arm_vctp16; break; - case 16: VCTPID = Intrinsic::arm_vctp8; break; - } - Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); - Value *TailPredicate = Builder.CreateCall(VCTP, Processed); - Predicate->replaceAllUsesWith(TailPredicate); - NewPredicates[Predicate] = cast<Instruction>(TailPredicate); - - // Add the incoming value to the new phi. - // TODO: This add likely already exists in the loop. - Value *Remaining = Builder.CreateSub(Processed, Factor); - Processed->addIncoming(Remaining, L->getLoopLatch()); - LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: " - << *Processed << "\n" - << "TP: Inserted VCTP: " << *TailPredicate << "\n"); + InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements); } // Now clean up. 
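For reference, the element-counting scheme that the new InsertVCTPIntrinsic helper builds can be sketched in plain C++: a counter starts at the number of elements, is decremented by the vector factor each iteration (the Processed/Remaining phi above), and the VCTP predicate enables the low min(remaining, lanes) lanes. The standalone sketch below is illustrative only and is not part of this commit; the vctp template is a scalar model of the intrinsic's documented lane-predication behaviour, shown for a 4-lane vector.

#include <array>
#include <cstdio>

// Scalar model of the MVE VCTP intrinsic: given the number of elements still
// to be processed, produce a lane predicate with the low
// min(remaining, Lanes) lanes set.
template <unsigned Lanes>
std::array<bool, Lanes> vctp(int remaining) {
  std::array<bool, Lanes> pred{};
  for (unsigned i = 0; i < Lanes; ++i)
    pred[i] = static_cast<int>(i) < remaining;
  return pred;
}

int main() {
  // Mirrors the IR the pass builds: the counter starts at NumElements and is
  // reduced by the vector factor (here 4 lanes) on every loop iteration.
  int remaining = 10; // NumElements
  for (int iter = 0; remaining > 0; ++iter) {
    auto pred = vctp<4>(remaining); // predicate fed to the masked vector ops
    std::printf("iter %d: remaining=%d pred=", iter, remaining);
    for (bool p : pred)
      std::printf("%d", p);
    std::printf("\n");
    remaining -= 4; // Remaining = Processed - Factor
  }
  return 0;
}

For 10 elements this prints predicates 1111, 1111, 1100: the final iteration is executed with only the first two lanes active, which is exactly the effect the inserted VCTP represents.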
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp index bc0a80b177ed..a5df46c94f42 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -22,9 +22,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" #include <cassert> #include <new> @@ -37,16 +37,21 @@ namespace { class MVEVPTBlock : public MachineFunctionPass { public: static char ID; - const Thumb2InstrInfo *TII; - const TargetRegisterInfo *TRI; MVEVPTBlock() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &Fn) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<ReachingDefAnalysis>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); + MachineFunctionProperties::Property::NoVRegs).set( + MachineFunctionProperties::Property::TracksLiveness); } StringRef getPassName() const override { @@ -55,6 +60,9 @@ namespace { private: bool InsertVPTBlocks(MachineBasicBlock &MBB); + + const Thumb2InstrInfo *TII = nullptr; + ReachingDefAnalysis *RDA = nullptr; }; char MVEVPTBlock::ID = 0; @@ -63,112 +71,32 @@ namespace { INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; - -static unsigned VCMPOpcodeToVPT(unsigned Opcode) { - switch (Opcode) { - case ARM::MVE_VCMPf32: - return ARM::MVE_VPTv4f32; - case ARM::MVE_VCMPf16: - return ARM::MVE_VPTv8f16; - case ARM::MVE_VCMPi8: - return ARM::MVE_VPTv16i8; - case ARM::MVE_VCMPi16: - return ARM::MVE_VPTv8i16; - case ARM::MVE_VCMPi32: - return ARM::MVE_VPTv4i32; - case ARM::MVE_VCMPu8: - return ARM::MVE_VPTv16u8; - case ARM::MVE_VCMPu16: - return ARM::MVE_VPTv8u16; - case ARM::MVE_VCMPu32: - return ARM::MVE_VPTv4u32; - case ARM::MVE_VCMPs8: - return ARM::MVE_VPTv16s8; - case ARM::MVE_VCMPs16: - return ARM::MVE_VPTv8s16; - case ARM::MVE_VCMPs32: - return ARM::MVE_VPTv4s32; - - case ARM::MVE_VCMPf32r: - return ARM::MVE_VPTv4f32r; - case ARM::MVE_VCMPf16r: - return ARM::MVE_VPTv8f16r; - case ARM::MVE_VCMPi8r: - return ARM::MVE_VPTv16i8r; - case ARM::MVE_VCMPi16r: - return ARM::MVE_VPTv8i16r; - case ARM::MVE_VCMPi32r: - return ARM::MVE_VPTv4i32r; - case ARM::MVE_VCMPu8r: - return ARM::MVE_VPTv16u8r; - case ARM::MVE_VCMPu16r: - return ARM::MVE_VPTv8u16r; - case ARM::MVE_VCMPu32r: - return ARM::MVE_VPTv4u32r; - case ARM::MVE_VCMPs8r: - return ARM::MVE_VPTv16s8r; - case ARM::MVE_VCMPs16r: - return ARM::MVE_VPTv8s16r; - case ARM::MVE_VCMPs32r: - return ARM::MVE_VPTv4s32r; - - default: - return 0; - } -} - -static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI, - const TargetRegisterInfo *TRI, +static MachineInstr *findVCMPToFoldIntoVPST(MachineInstr *MI, + 
ReachingDefAnalysis *RDA, unsigned &NewOpcode) { - // Search backwards to the instruction that defines VPR. This may or not - // be a VCMP, we check that after this loop. If we find another instruction - // that reads cpsr, we return nullptr. - MachineBasicBlock::iterator CmpMI = MI; - while (CmpMI != MI->getParent()->begin()) { - --CmpMI; - if (CmpMI->modifiesRegister(ARM::VPR, TRI)) - break; - if (CmpMI->readsRegister(ARM::VPR, TRI)) - break; - } - - if (CmpMI == MI) - return nullptr; - NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode()); - if (NewOpcode == 0) + // First, search backwards to the instruction that defines VPR + auto *Def = RDA->getReachingMIDef(MI, ARM::VPR); + if (!Def) return nullptr; - // Search forward from CmpMI to MI, checking if either register was def'd - if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI), - MI, TRI)) + // Now check that Def is a VCMP + if (!(NewOpcode = VCMPOpcodeToVPT(Def->getOpcode()))) return nullptr; - if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI), - MI, TRI)) + + // Check that Def's operands are not defined between the VCMP and MI, i.e. + // check that they have the same reaching def. + if (!RDA->hasSameReachingDef(Def, MI, Def->getOperand(1).getReg()) || + !RDA->hasSameReachingDef(Def, MI, Def->getOperand(2).getReg())) return nullptr; - return &*CmpMI; + + return Def; } bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { bool Modified = false; MachineBasicBlock::instr_iterator MBIter = Block.instr_begin(); MachineBasicBlock::instr_iterator EndIter = Block.instr_end(); + SmallSet<MachineInstr *, 4> RemovedVCMPs; while (MBIter != EndIter) { MachineInstr *MI = &*MBIter; @@ -208,29 +136,13 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { ++MBIter; }; - unsigned BlockMask = 0; - switch (VPTInstCnt) { - case 1: - BlockMask = VPTMaskValue::T; - break; - case 2: - BlockMask = VPTMaskValue::TT; - break; - case 3: - BlockMask = VPTMaskValue::TTT; - break; - case 4: - BlockMask = VPTMaskValue::TTTT; - break; - default: - llvm_unreachable("Unexpected number of instruction in a VPT block"); - }; + unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt); // Search back for a VCMP that can be folded to create a VPT, or else create // a VPST directly MachineInstrBuilder MIBuilder; unsigned NewOpcode; - MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode); + MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, RDA, NewOpcode); if (VCMP) { LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); @@ -238,7 +150,11 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { MIBuilder.add(VCMP->getOperand(1)); MIBuilder.add(VCMP->getOperand(2)); MIBuilder.add(VCMP->getOperand(3)); - VCMP->eraseFromParent(); + // We delay removing the actual VCMP instruction by saving it to a list + // and deleting all instructions in this list in one go after we have + // created the VPT blocks. We do this in order not to invalidate the + // ReachingDefAnalysis that is queried by 'findVCMPToFoldIntoVPST'. 
+ RemovedVCMPs.insert(VCMP); } else { MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); MIBuilder.addImm(BlockMask); @@ -249,10 +165,17 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { Modified = true; } + + for (auto *I : RemovedVCMPs) + I->eraseFromParent(); + return Modified; } bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return false; + const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(Fn.getSubtarget()); @@ -260,7 +183,7 @@ bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { return false; TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); - TRI = STI.getRegisterInfo(); + RDA = &getAnalysis<ReachingDefAnalysis>(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" << "********** Function: " << Fn.getName() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp index 86cb907abfa3..a7f7d75e356e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -27,7 +27,7 @@ Target &llvm::getTheThumbBETarget() { return TheThumbBETarget; } -extern "C" void LLVMInitializeARMTargetInfo() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetInfo() { RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm", "ARM", "ARM"); RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb", diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index fccaa4c9cc8a..b08b71a4952d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -37,8 +37,8 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc) const { + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { // Need to check the arch. 
MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h index bc433e7a7a93..530289fe8c5d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -38,7 +38,7 @@ public: const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index af1f0aeb27ba..e06bb9546c03 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -120,8 +120,8 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc) const { + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { // Handle SPR, DPR, and QPR copies. if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); @@ -303,50 +303,45 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, continue; } - bool HasCCOut = true; - if (BaseReg == ARM::SP) { - // sub sp, sp, #imm7 - if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) { - assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?"); - Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; - BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(BaseReg) - .addImm(ThisVal / 4) - .setMIFlags(MIFlags) - .add(predOps(ARMCC::AL)); - NumBytes = 0; - continue; - } + assert((DestReg != ARM::SP || BaseReg == ARM::SP) && + "Writing to SP, from other register."); - // sub rd, sp, so_imm - Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; - if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { - NumBytes = 0; - } else { - // FIXME: Move this to ARMAddressingModes.h? - unsigned RotAmt = countLeadingZeros(ThisVal); - ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); - NumBytes &= ~ThisVal; - assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && - "Bit extraction didn't work?"); - } + // Try to use T1, as it smaller + if ((DestReg == ARM::SP) && (ThisVal < ((1 << 7) - 1) * 4)) { + assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?"); + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg) + .addImm(ThisVal / 4) + .setMIFlags(MIFlags) + .add(predOps(ARMCC::AL)); + break; + } + bool HasCCOut = true; + int ImmIsT2SO = ARM_AM::getT2SOImmVal(ThisVal); + bool ToSP = DestReg == ARM::SP; + unsigned t2SUB = ToSP ? ARM::t2SUBspImm : ARM::t2SUBri; + unsigned t2ADD = ToSP ? ARM::t2ADDspImm : ARM::t2ADDri; + unsigned t2SUBi12 = ToSP ? ARM::t2SUBspImm12 : ARM::t2SUBri12; + unsigned t2ADDi12 = ToSP ? ARM::t2ADDspImm12 : ARM::t2ADDri12; + Opc = isSub ? 
t2SUB : t2ADD; + // Prefer T2: sub rd, rn, so_imm | sub sp, sp, so_imm + if (ImmIsT2SO != -1) { + NumBytes = 0; + } else if (ThisVal < 4096) { + // Prefer T3 if can make it in a single go: subw rd, rn, imm12 | subw sp, + // sp, imm12 + Opc = isSub ? t2SUBi12 : t2ADDi12; + HasCCOut = false; + NumBytes = 0; } else { - assert(DestReg != ARM::SP && BaseReg != ARM::SP); - Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; - if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { - NumBytes = 0; - } else if (ThisVal < 4096) { - Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; - HasCCOut = false; - NumBytes = 0; - } else { - // FIXME: Move this to ARMAddressingModes.h? - unsigned RotAmt = countLeadingZeros(ThisVal); - ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); - NumBytes &= ~ThisVal; - assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && - "Bit extraction didn't work?"); - } + // Use one T2 instruction to reduce NumBytes + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = countLeadingZeros(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); } // Build the new ADD / SUB. @@ -375,6 +370,8 @@ negativeOffsetOpcode(unsigned opcode) case ARM::t2STRBi12: return ARM::t2STRBi8; case ARM::t2STRHi12: return ARM::t2STRHi8; case ARM::t2PLDi12: return ARM::t2PLDi8; + case ARM::t2PLDWi12: return ARM::t2PLDWi8; + case ARM::t2PLIi12: return ARM::t2PLIi8; case ARM::t2LDRi8: case ARM::t2LDRHi8: @@ -385,13 +382,13 @@ negativeOffsetOpcode(unsigned opcode) case ARM::t2STRBi8: case ARM::t2STRHi8: case ARM::t2PLDi8: + case ARM::t2PLDWi8: + case ARM::t2PLIi8: return opcode; default: - break; + llvm_unreachable("unknown thumb2 opcode."); } - - return 0; } static unsigned @@ -407,6 +404,8 @@ positiveOffsetOpcode(unsigned opcode) case ARM::t2STRBi8: return ARM::t2STRBi12; case ARM::t2STRHi8: return ARM::t2STRHi12; case ARM::t2PLDi8: return ARM::t2PLDi12; + case ARM::t2PLDWi8: return ARM::t2PLDWi12; + case ARM::t2PLIi8: return ARM::t2PLIi12; case ARM::t2LDRi12: case ARM::t2LDRHi12: @@ -417,13 +416,13 @@ positiveOffsetOpcode(unsigned opcode) case ARM::t2STRBi12: case ARM::t2STRHi12: case ARM::t2PLDi12: + case ARM::t2PLDWi12: + case ARM::t2PLIi12: return opcode; default: - break; + llvm_unreachable("unknown thumb2 opcode."); } - - return 0; } static unsigned @@ -439,6 +438,8 @@ immediateOffsetOpcode(unsigned opcode) case ARM::t2STRBs: return ARM::t2STRBi12; case ARM::t2STRHs: return ARM::t2STRHi12; case ARM::t2PLDs: return ARM::t2PLDi12; + case ARM::t2PLDWs: return ARM::t2PLDWi12; + case ARM::t2PLIs: return ARM::t2PLIi12; case ARM::t2LDRi12: case ARM::t2LDRHi12: @@ -449,6 +450,8 @@ immediateOffsetOpcode(unsigned opcode) case ARM::t2STRBi12: case ARM::t2STRHi12: case ARM::t2PLDi12: + case ARM::t2PLDWi12: + case ARM::t2PLIi12: case ARM::t2LDRi8: case ARM::t2LDRHi8: case ARM::t2LDRBi8: @@ -458,13 +461,13 @@ immediateOffsetOpcode(unsigned opcode) case ARM::t2STRBi8: case ARM::t2STRHi8: case ARM::t2PLDi8: + case ARM::t2PLDWi8: + case ARM::t2PLIi8: return opcode; default: - break; + llvm_unreachable("unknown thumb2 opcode."); } - - return 0; } bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, @@ -484,7 +487,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? 
- if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { + const bool IsSP = Opcode == ARM::t2ADDspImm12 || Opcode == ARM::t2ADDspImm; + if (IsSP || Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { Offset += MI.getOperand(FrameRegIdx+1).getImm(); unsigned PredReg; @@ -501,14 +505,14 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return true; } - bool HasCCOut = Opcode != ARM::t2ADDri12; + bool HasCCOut = (Opcode != ARM::t2ADDspImm12 && Opcode != ARM::t2ADDri12); if (Offset < 0) { Offset = -Offset; isSub = true; - MI.setDesc(TII.get(ARM::t2SUBri)); + MI.setDesc(IsSP ? TII.get(ARM::t2SUBspImm) : TII.get(ARM::t2SUBri)); } else { - MI.setDesc(TII.get(ARM::t2ADDri)); + MI.setDesc(IsSP ? TII.get(ARM::t2ADDspImm) : TII.get(ARM::t2ADDri)); } // Common case: small offset, fits into instruction. @@ -524,7 +528,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Another common case: imm12. if (Offset < 4096 && (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) { - unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + unsigned NewOpc = isSub ? IsSP ? ARM::t2SUBspImm12 : ARM::t2SUBri12 + : IsSP ? ARM::t2ADDspImm12 : ARM::t2ADDri12; MI.setDesc(TII.get(NewOpc)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h index a6712d5a0e72..7d8dff14e1e7 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -39,7 +39,7 @@ public: MachineBasicBlock::iterator MBBI) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index aa3aca359cb8..27605422983d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -64,6 +64,25 @@ inline static CondCodes getOppositeCondition(CondCodes CC) { case LE: return GT; } } + +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). 
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { + switch (CC) { + default: return ARMCC::AL; + case ARMCC::EQ: return ARMCC::EQ; + case ARMCC::NE: return ARMCC::NE; + case ARMCC::HS: return ARMCC::LS; + case ARMCC::LO: return ARMCC::HI; + case ARMCC::HI: return ARMCC::LO; + case ARMCC::LS: return ARMCC::HS; + case ARMCC::GE: return ARMCC::LE; + case ARMCC::LT: return ARMCC::GT; + case ARMCC::GT: return ARMCC::LT; + case ARMCC::LE: return ARMCC::GE; + } +} } // end namespace ARMCC namespace ARMVCC { @@ -72,6 +91,40 @@ namespace ARMVCC { Then, Else }; + + enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 + }; +} + +inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { + switch (NumInsts) { + case 1: + return ARMVCC::T; + case 2: + return ARMVCC::TT; + case 3: + return ARMVCC::TTT; + case 4: + return ARMVCC::TTTT; + default: + break; + }; + llvm_unreachable("Unexpected number of instruction in a VPT block"); } inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) { |
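The getSwappedCondition mapping added to ARMBaseInfo.h follows from how comparisons behave when their operands are exchanged: with flags set by cmp(b, a) instead of cmp(a, b), GE pairs with LE, LT with GT, HS with LS, and HI with LO, while EQ and NE are unchanged. The standalone sketch below is illustrative only and not part of this commit; it checks those equivalences with ordinary C++ comparisons over a few signed and unsigned sample values rather than through any LLVM API.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t svals[] = {-7, -1, 0, 1, 42};
  const uint32_t uvals[] = {0u, 1u, 7u, 0x80000000u, 0xffffffffu};

  // Signed codes: flags from cmp(a, b) vs. cmp(b, a).
  for (int32_t a : svals)
    for (int32_t b : svals) {
      assert((a >= b) == (b <= a)); // GE <-> LE
      assert((a <  b) == (b >  a)); // LT <-> GT
      assert((a == b) == (b == a)); // EQ <-> EQ
    }

  // Unsigned codes.
  for (uint32_t a : uvals)
    for (uint32_t b : uvals) {
      assert((a >= b) == (b <= a)); // HS <-> LS
      assert((a >  b) == (b <  a)); // HI <-> LO
    }
  return 0;
}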