author    | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit    | cfca06d7963fa0909f90483b42a6d7d194d01e08
tree      | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AArch64
parent    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b
Diffstat (limited to 'llvm/lib/Target/AArch64')
89 files changed, 15016 insertions(+), 3367 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index ac765ebcddc04..fd35b530e3ce4 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -38,6 +38,8 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64SLSHardeningPass(); +FunctionPass *createAArch64IndirectThunks(); FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); FunctionPass *createAArch64SIMDInstrOptPass(); @@ -52,11 +54,13 @@ FunctionPass *createAArch64BranchTargetsPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, AArch64Subtarget &, AArch64RegisterBankInfo &); FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone); -FunctionPass *createAArch64StackTaggingPass(bool MergeInit); +FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone); +FunctionPass *createAArch64StackTaggingPass(bool IsOptNone); FunctionPass *createAArch64StackTaggingPreRAPass(); void initializeAArch64A53Fix835769Pass(PassRegistry&); @@ -70,16 +74,19 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); +void initializeAArch64SLSHardeningPass(PassRegistry&); void initializeAArch64SpeculationHardeningPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); +void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); void initializeFalkorHWPFFixPass(PassRegistry&); void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); +void initializeSVEIntrinsicOptsPass(PassRegistry&); void initializeAArch64StackTaggingPass(PassRegistry&); void initializeAArch64StackTaggingPreRAPass(PassRegistry&); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 0106355b1a440..534af9686af06 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -42,11 +42,11 @@ def FeatureAES : SubtargetFeature< "Enable AES support", [FeatureNEON]>; // Crypto has been split up and any combination is now valid (see the -// crypto defintions above). Also, crypto is now context sensitive: +// crypto definitions above). Also, crypto is now context sensitive: // it has a different meaning for e.g. Armv8.4 than it has for Armv8.2. // Therefore, we rely on Clang, the user interacing tool, to pass on the // appropriate crypto options. But here in the backend, crypto has very little -// meaning anymore. We kept the Crypto defintion here for backward +// meaning anymore. We kept the Crypto definition here for backward // compatibility, and now imply features SHA2 and AES, which was the // "traditional" meaning of Crypto. 
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", @@ -101,7 +101,25 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", "true", "Enable v8.2 data Cache Clean to Point of Persistence" >; def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", - "Enable Scalable Vector Extension (SVE) instructions">; + "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>; + +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. +def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>; @@ -142,7 +160,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align", "Disallow all unaligned memory " "access">; -foreach i = {1-7,9-15,18,20-28} in +foreach i = {1-7,9-15,18,20-28,30} in def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true", "Reserve X"#i#", making it unavailable " "as a GPR">; @@ -240,11 +258,11 @@ def FeatureDotProd : SubtargetFeature< def FeaturePA : SubtargetFeature< "pa", "HasPA", "true", - "Enable v8.3-A Pointer Authentication enchancement">; + "Enable v8.3-A Pointer Authentication extension">; def FeatureJS : SubtargetFeature< "jsconv", "HasJS", "true", - "Enable v8.3-A JavaScript FP conversion enchancement", + "Enable v8.3-A JavaScript FP conversion instructions", [FeatureFPARMv8]>; def FeatureCCIDX : SubtargetFeature< @@ -281,6 +299,11 @@ def FeatureAM : SubtargetFeature< "am", "HasAM", "true", "Enable v8.4-A Activity Monitors extension">; +def FeatureAMVS : SubtargetFeature< + "amvs", "HasAMVS", "true", + "Enable v8.6-A Activity Monitors Virtualization support", + [FeatureAM]>; + def FeatureSEL2 : SubtargetFeature< "sel2", "HasSEL2", "true", "Enable v8.4-A Secure Exception Level 2 extension">; @@ -365,6 +388,25 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension" >; + +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + +def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", + "true", "Enable fine grained virtualization traps extension">; + +def FeatureEnhancedCounterVirtualization : + SubtargetFeature<"ecv", 
"HasEnhancedCounterVirtualization", + "true", "Enable enhanced counter virtualization extension">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -391,8 +433,13 @@ def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict, FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist, - FeatureBranchTargetId] ->; + FeatureBranchTargetId]>; + +def HasV8_6aOps : SubtargetFeature< + "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", + + [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; //===----------------------------------------------------------------------===// // Register File Description @@ -429,6 +476,17 @@ def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP", "true", "Permit use of TPIDR_EL"#i#" for the TLS base">; //===----------------------------------------------------------------------===// +// Control codegen mitigation against Straight Line Speculation vulnerability. +//===----------------------------------------------------------------------===// + +def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", + "HardenSlsRetBr", "true", + "Harden against straight line speculation across RET and BR instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across BLR instructions">; + +//===----------------------------------------------------------------------===// // AArch64 Processors supported. // @@ -443,6 +501,10 @@ def SVEUnsupported : AArch64Unsupported { HasSVE2BitPerm]; } +def PAUnsupported : AArch64Unsupported { + let F = [HasPA]; +} + include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" @@ -453,6 +515,7 @@ include "AArch64SchedExynosM4.td" include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" +include "AArch64SchedThunderX3T110.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", [ @@ -563,6 +626,67 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", FeatureSSBS ]>; +def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", + "Cortex-A77 ARM processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureNEON, FeatureRCPC, + FeatureCrypto, + FeatureFullFP16, + FeatureDotProd + ]>; + +def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", + "CortexA78", + "Cortex-A78 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureSSBS, + FeatureDotProd]>; + +def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", + "Cortex-X1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureDotProd]>; + +def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", + "Fujitsu A64FX processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureNEON, + FeatureSHA2, + FeaturePerfMon, + FeatureFullFP16, + FeatureSVE, + FeaturePostRAScheduler, + FeatureComplxNum + ]>; + +def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", + 
"Nvidia Carmel processors", [ + HasV8_2aOps, + FeatureNEON, + FeatureCrypto, + FeatureFullFP16 + ]>; + // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targetting apple OSes. def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", @@ -780,6 +904,25 @@ def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", FeatureLSE, HasV8_1aOps]>; +def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily", + "ThunderX3T110", + "Marvell ThunderX3 processors", [ + FeatureAggressiveFMA, + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureArithmeticBccFusion, + FeatureNEON, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureLSE, + FeaturePA, + FeatureUseAA, + FeatureBalanceFPOps, + FeaturePerfMon, + FeatureStrictAlign, + HasV8_3aOps]>; + def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", "Cavium ThunderX processors", [ FeatureCRC, @@ -844,7 +987,7 @@ def : ProcessorModel<"generic", NoSchedModel, [ FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, -// ETE and TRBE are future architecture extensions. We temporariliy enable them +// ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64, until it is decided in which // armv8.x-a architecture revision they will end up. The extensions do not // affect code generated by the compiler and can be used only by explicitly @@ -853,6 +996,7 @@ def : ProcessorModel<"generic", NoSchedModel, [ ]>; def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; +def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; @@ -863,6 +1007,9 @@ def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>; +def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>; +def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; @@ -878,6 +1025,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; // Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; +// Marvell ThunderX3T110 Processors. +def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>; // FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57. def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>; @@ -900,6 +1049,13 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>; // Alias for the latest Apple processor model supported by LLVM. def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>; +// Fujitsu A64FX +// FIXME: Scheduling model is not implemented yet. 
+def : ProcessorModel<"a64fx", NoSchedModel, [ProcA64FX]>; + +// Nvidia Carmel +def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 00e321f9b8509..3a94820dac8d3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -84,8 +84,8 @@ public: return MCInstLowering.lowerOperand(MO, MCOp); } - void EmitStartOfAsmFile(Module &M) override; - void EmitJumpTableInfo() override; + void emitStartOfAsmFile(Module &M) override; + void emitJumpTableInfo() override; void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI); @@ -112,7 +112,9 @@ public: bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); - void EmitInstruction(const MachineInstr *MI) override; + void emitInstruction(const MachineInstr *MI) override; + + void emitFunctionHeaderComment() override; void getAnalysisUsage(AnalysisUsage &AU) const override { AsmPrinter::getAnalysisUsage(AU); @@ -139,7 +141,7 @@ public: } // Emit the rest of the function body. - EmitFunctionBody(); + emitFunctionBody(); // Emit the XRay table for this function. emitXRayTable(); @@ -162,10 +164,10 @@ private: void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - void EmitFunctionBodyEnd() override; + void emitFunctionBodyEnd() override; MCSymbol *GetCPISymbol(unsigned CPID) const override; - void EmitEndOfAsmFile(Module &M) override; + void emitEndOfAsmFile(Module &M) override; AArch64FunctionInfo *AArch64FI = nullptr; @@ -182,7 +184,7 @@ private: } // end anonymous namespace -void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) { +void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { if (!TM.getTargetTriple().isOSBinFormatELF()) return; @@ -225,22 +227,29 @@ void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer->SwitchSection(Nt); // Emit the note header. - EmitAlignment(Align(8)); - OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0" - OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size - OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); - OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name + emitAlignment(Align(8)); + OutStreamer->emitInt32(4); // data size for "GNU\0" + OutStreamer->emitInt32(4 * 4); // Elf_Prop size + OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0); + OutStreamer->emitBytes(StringRef("GNU", 4)); // note name // Emit the PAC/BTI properties. 
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); - OutStreamer->EmitIntValue(4, 4); // data size - OutStreamer->EmitIntValue(Flags, 4); // data - OutStreamer->EmitIntValue(0, 4); // pad + OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND); + OutStreamer->emitInt32(4); // data size + OutStreamer->emitInt32(Flags); // data + OutStreamer->emitInt32(0); // pad OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); } +void AArch64AsmPrinter::emitFunctionHeaderComment() { + const AArch64FunctionInfo *FI = MF->getInfo<AArch64FunctionInfo>(); + Optional<std::string> OutlinerString = FI->getOutliningStyle(); + if (OutlinerString != None) + OutStreamer->GetCommentOS() << ' ' << OutlinerString; +} + void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { const Function &F = MF->getFunction(); @@ -250,8 +259,7 @@ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) .getValueAsString() .getAsInteger(10, Num)) return; - for (; Num; --Num) - EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + emitNops(Num); return; } @@ -291,9 +299,9 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) // ;DATA: higher 32 bits of the address of the trampoline // LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack // - OutStreamer->EmitCodeAlignment(4); + OutStreamer->emitCodeAlignment(4); auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitLabel(CurSled); auto Target = OutContext.createTempSymbol(); // Emit "B #32" instruction, which jumps over the next 28 bytes. @@ -304,8 +312,8 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) for (int8_t I = 0; I < NoopsInSledCount; I++) EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); - OutStreamer->EmitLabel(Target); - recordSled(CurSled, MI, Kind); + OutStreamer->emitLabel(Target); + recordSled(CurSled, MI, Kind, 2); } void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) { @@ -364,25 +372,25 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName())); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden); - OutStreamer->EmitLabel(Sym); + OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); + OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); + OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden); + OutStreamer->emitLabel(Sym); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri) .addReg(AArch64::X16) .addReg(Reg) .addImm(4) .addImm(55), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBroX) .addReg(AArch64::W16) .addReg(AArch64::X9) .addReg(AArch64::X16) .addImm(0) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) @@ -390,33 +398,33 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), *STI); MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol(); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::NE) 
.addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym, OutContext)), *STI); MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->EmitLabel(ReturnSym); - OutStreamer->EmitInstruction( + OutStreamer->emitLabel(ReturnSym); + OutStreamer->emitInstruction( MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI); - OutStreamer->EmitLabel(HandleMismatchOrPartialSym); + OutStreamer->emitLabel(HandleMismatchOrPartialSym); if (IsShort) { - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri) .addReg(AArch64::WZR) .addReg(AArch64::W16) .addImm(15) .addImm(0), *STI); MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::HI) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ANDXri) .addReg(AArch64::X17) .addReg(Reg) @@ -424,59 +432,59 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { *STI); unsigned Size = 1 << (AccessInfo & 0xf); if (Size != 1) - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X17) .addReg(AArch64::X17) .addImm(Size - 1) .addImm(0), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWrs) .addReg(AArch64::WZR) .addReg(AArch64::W16) .addReg(AArch64::W17) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::LS) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ORRXri) .addReg(AArch64::X16) .addReg(Reg) .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBui) .addReg(AArch64::W16) .addReg(AArch64::X16) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) .addReg(Reg) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::EQ) .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), *STI); - OutStreamer->EmitLabel(HandleMismatchSym); + OutStreamer->emitLabel(HandleMismatchSym); } - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXpre) .addReg(AArch64::SP) .addReg(AArch64::X0) .addReg(AArch64::X1) .addReg(AArch64::SP) .addImm(-32), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXi) .addReg(AArch64::FP) .addReg(AArch64::LR) .addReg(AArch64::SP) @@ -484,13 +492,13 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { *STI); if (Reg != AArch64::X0) - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::ORRXrs) .addReg(AArch64::X0) .addReg(AArch64::XZR) .addReg(Reg) .addImm(0), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::MOVZXi) .addReg(AArch64::X1) .addImm(AccessInfo) 
.addImm(0), @@ -499,14 +507,14 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { // Intentionally load the GOT entry and branch to it, rather than possibly // late binding the function, which may clobber the registers before we have // a chance to save them. - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ADRP) .addReg(AArch64::X16) .addExpr(AArch64MCExpr::create( HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::LDRXui) .addReg(AArch64::X16) .addReg(AArch64::X16) @@ -514,12 +522,12 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI); } } -void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { +void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { EmitHwasanMemaccessSymbols(M); const Triple &TT = TM.getTargetTriple(); @@ -529,7 +537,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { // implementation of multiple entry points). If this doesn't occur, the // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. - OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } emitStackMaps(SM); } @@ -544,12 +552,12 @@ void AArch64AsmPrinter::EmitLOHs() { "Label hasn't been inserted for LOH related instruction"); MCArgs.push_back(LabelIt->second); } - OutStreamer->EmitLOHDirective(D.getKind(), MCArgs); + OutStreamer->emitLOHDirective(D.getKind(), MCArgs); MCArgs.clear(); } } -void AArch64AsmPrinter::EmitFunctionBodyEnd() { +void AArch64AsmPrinter::emitFunctionBodyEnd() { if (!AArch64FI->getLOHRelated().empty()) EmitLOHs(); } @@ -741,11 +749,10 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, assert(NOps == 4); OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; // cast away const; DIetc do not take const operands for some reason. - OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata()) - ->getName(); + OS << MI->getDebugVariable()->getName(); OS << " <- "; // Frame address. Currently handles register +- offset only. 
- assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + assert(MI->getDebugOperand(0).isReg() && MI->isDebugOffsetImm()); OS << '['; printOperand(MI, 0, OS); OS << '+'; @@ -755,7 +762,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, printOperand(MI, NOps - 2, OS); } -void AArch64AsmPrinter::EmitJumpTableInfo() { +void AArch64AsmPrinter::emitJumpTableInfo() { const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (!MJTI) return; @@ -783,8 +790,8 @@ void AArch64AsmPrinter::EmitJumpTableInfo() { if (JTBBs.empty()) continue; unsigned Size = AFI->getJumpTableEntrySize(JTI); - EmitAlignment(Align(Size)); - OutStreamer->EmitLabel(GetJTISymbol(JTI)); + emitAlignment(Align(Size)); + OutStreamer->emitLabel(GetJTISymbol(JTI)); for (auto *JTBB : JTBBs) emitJumpTableEntry(MJTI, JTBB, JTI); @@ -812,7 +819,7 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, Value, MCConstantExpr::create(2, OutContext), OutContext); } - OutStreamer->EmitValue(Value, Size); + OutStreamer->emitValue(Value, Size); } /// Small jump tables contain an unsigned byte or half, representing the offset @@ -868,7 +875,7 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, auto &Ctx = OutStreamer.getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer.EmitLabel(MILabel); + OutStreamer.emitLabel(MILabel); SM.recordStackMap(*MILabel, MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); @@ -898,7 +905,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { auto &Ctx = OutStreamer.getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer.EmitLabel(MILabel); + OutStreamer.emitLabel(MILabel); SM.recordPatchPoint(*MILabel, MI); PatchPointOpers Opers(&MI); @@ -982,7 +989,7 @@ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" -void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { +void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; @@ -992,7 +999,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *LOHLabel = createTempSymbol("loh"); // Associate the instruction with the label LOHInstToLabel[MI] = LOHLabel; - OutStreamer->EmitLabel(LOHLabel); + OutStreamer->emitLabel(LOHLabel); } AArch64TargetStreamer *TS = @@ -1001,6 +1008,26 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case AArch64::HINT: { + // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for + // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be + // non-empty. If MI is the initial BTI, place the + // __patchable_function_entries label after BTI. 
+ if (CurrentPatchableFunctionEntrySym && + CurrentPatchableFunctionEntrySym == CurrentFnBegin && + MI == &MF->front().front()) { + int64_t Imm = MI->getOperand(0).getImm(); + if ((Imm & 32) && (Imm & 6)) { + MCInst Inst; + MCInstLowering.Lower(MI, Inst); + EmitToStreamer(*OutStreamer, Inst); + CurrentPatchableFunctionEntrySym = createTempSymbol("patch"); + OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym); + return; + } + } + break; + } case AArch64::MOVMCSym: { Register DestReg = MI->getOperand(0).getReg(); const MachineOperand &MO_Sym = MI->getOperand(1); @@ -1048,7 +1075,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallString<128> TmpStr; raw_svector_ostream OS(TmpStr); PrintDebugValueComment(MI, OS); - OutStreamer->EmitRawText(StringRef(OS.str())); + OutStreamer->emitRawText(StringRef(OS.str())); } return; @@ -1061,7 +1088,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (needsCFIMoves() == CFI_M_None) return; - OutStreamer->EmitCFIBKeyFrame(); + OutStreamer->emitCFIBKeyFrame(); return; } } @@ -1087,6 +1114,25 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } + case AArch64::SpeculationBarrierISBDSBEndBB: { + // Print DSB SYS + ISB + MCInst TmpInstDSB; + TmpInstDSB.setOpcode(AArch64::DSB); + TmpInstDSB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstDSB); + MCInst TmpInstISB; + TmpInstISB.setOpcode(AArch64::ISB); + TmpInstISB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstISB); + return; + } + case AArch64::SpeculationBarrierSBEndBB: { + // Print SB + MCInst TmpInstSB; + TmpInstSB.setOpcode(AArch64::SB); + EmitToStreamer(*OutStreamer, TmpInstSB); + return; + } case AArch64::TLSDESC_CALLSEQ: { /// lower this to: /// adrp x0, :tlsdesc:var diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index 6fa3a462bc71a..1956014b738d0 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -118,9 +118,15 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, auto MBBI = MBB.begin(); - // PACI[AB]SP are implicitly BTI JC, so no BTI instruction needed there. - if (MBBI != MBB.end() && (MBBI->getOpcode() == AArch64::PACIASP || - MBBI->getOpcode() == AArch64::PACIBSP)) + // Skip the meta instuctions, those will be removed anyway. + for (; MBBI != MBB.end() && MBBI->isMetaInstruction(); ++MBBI) + ; + + // SCTLR_EL1.BT[01] is set to 0 by default which means + // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there. 
+ if (MBBI != MBB.end() && HintNum == 34 && + (MBBI->getOpcode() == AArch64::PACIASP || + MBBI->getOpcode() == AArch64::PACIBSP)) return; BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index a0695cef615f3..84ec5afcc9c19 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -38,18 +38,17 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, - CCState &State, unsigned SlotAlign) { + CCState &State, Align SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); - const Align OrigAlign(ArgFlags.getOrigAlign()); - const Align Align = std::min(OrigAlign, StackAlign); + const Align OrigAlign = ArgFlags.getNonZeroOrigAlign(); + const Align Alignment = std::min(OrigAlign, StackAlign); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack( - Size, std::max((unsigned)Align.value(), SlotAlign))); + It.convertToMem(State.AllocateStack(Size, std::max(Alignment, SlotAlign))); State.addLoc(It); - SlotAlign = 1; + SlotAlign = Align(1); } // All pending members have now been allocated @@ -72,7 +71,7 @@ static bool CC_AArch64_Custom_Stack_Block( if (!ArgFlags.isInConsecutiveRegsLast()) return true; - return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, Align(8)); } /// Given an [N x Ty] block, it should be passed in a consecutive sequence of @@ -146,7 +145,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, for (auto Reg : RegList) State.AllocateReg(Reg); - unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8); return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); } diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index a0b2d7712b662..fdcc890bf5892 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -10,9 +10,6 @@ // //===----------------------------------------------------------------------===// -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign<string Align, CCAction A> : - CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; @@ -33,9 +30,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. - CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], + CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8], CCBitConvertToType<f64>>>, - CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8], + CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8], CCBitConvertToType<f128>>>, // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter. 
@@ -75,10 +72,10 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCPassIndirect<i64>>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -102,22 +99,24 @@ def CC_AArch64_AAPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, + CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>, CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -132,9 +131,9 @@ def RetCC_AArch64_AAPCS : CallingConv<[ // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. 
- CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], + CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8], CCBitConvertToType<f64>>>, - CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8], + CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8], CCBitConvertToType<f128>>>, CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -144,18 +143,20 @@ def RetCC_AArch64_AAPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -165,7 +166,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ // Vararg functions on windows pass floats in integer registers let Entry = 1 in def CC_AArch64_Win64_VarArg : CallingConv<[ - CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>, CCIfType<[f64], CCBitConvertToType<i64>>, CCDelegateTo<CC_AArch64_AAPCS> ]>; @@ -219,19 +220,22 @@ def CC_AArch64_DarwinPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. 
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, - CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16", + CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, // Re-demote pointers to 32-bits so we don't end up storing 64-bit @@ -239,9 +243,9 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCIfPtr<CCIfILP32<CCTruncToType<i32>>>, CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -255,14 +259,14 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // Handle all scalar types as either i64 or f64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, - CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -275,16 +279,16 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ // Handle all scalar types as either i32 or f32. CCIfType<[i8, i16], CCPromoteToType<i32>>, - CCIfType<[f16], CCPromoteToType<f32>>, + CCIfType<[f16, bf16], CCPromoteToType<f32>>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfPtr<CCIfILP32<CCTruncToType<i32>>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -377,11 +381,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, D8, D9, D10, D11, D12, D13, D14, D15)>; -// Darwin puts the frame-record at the top of the callee-save area. -def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, - D8, D9, D10, D11, - D12, D13, D14, D15)>; +// A variant for treating X18 as callee saved, when interfacing with +// code that needs X18 to be preserved. +def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>; // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, @@ -421,33 +423,7 @@ def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23), def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError - : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; - -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. 
x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_AArch64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// We can only handle a register pair with adjacent registers, the register pair -// should belong to the same class as well. Since the access function on the -// fast path calls a function that follows CSR_AArch64_TLS_Darwin, -// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. -def CSR_AArch64_CXX_TLS_Darwin - : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, - (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), - (sequence "D%u", 0, 31))>; - -// CSRs that are handled by prologue, epilogue. -def CSR_AArch64_CXX_TLS_Darwin_PE - : CalleeSavedRegs<(add LR, FP)>; - -// CSRs that are handled explicitly via copies. -def CSR_AArch64_CXX_TLS_Darwin_ViaCopy - : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. @@ -472,14 +448,57 @@ def CSR_AArch64_StackProbe_Windows (sequence "X%u", 18, 28), FP, SP, (sequence "Q%u", 0, 31))>; +// Darwin variants of AAPCS. +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, + X22, X23, X24, X25, X26, X27, + X28, (sequence "Q%u", 8, 23))>; +def CSR_Darwin_AArch64_AAPCS_ThisReturn + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>; + +def CSR_Darwin_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_Darwin_AArch64_TLS + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_Darwin_AArch64_TLS, +// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS. +def CSR_Darwin_AArch64_CXX_TLS + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_Darwin_AArch64_CXX_TLS_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_Darwin_AArch64_CXX_TLS_ViaCopy + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>; + +def CSR_Darwin_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + // Variants of the standard calling conventions for shadow call stack. // These all preserve x18 in addition to any other registers. 
def CSR_AArch64_NoRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>; def CSR_AArch64_AllRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>; -def CSR_AArch64_CXX_TLS_Darwin_SCS - : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>; def CSR_AArch64_AAPCS_SwiftError_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; def CSR_AArch64_RT_MostRegs_SCS diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 688bd1b28e855..3f244ba10102a 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -105,6 +105,10 @@ struct LDTLSCleanup : public MachineFunctionPass { TII->get(TargetOpcode::COPY), AArch64::X0) .addReg(TLSBaseAddrReg); + // Update the call site info. + if (I.shouldUpdateCallSiteInfo()) + I.getMF()->eraseCallSiteInfo(&I); + // Erase the TLS_base_addr instruction. I.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 35e6fef24363c..efdb1131abc91 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -382,7 +382,7 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, /// Update state when seeing and ADRP instruction. static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, - LOHInfo &Info) { + LOHInfo &Info, LOHInfo *LOHInfos) { if (Info.LastADRP != nullptr) { LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t' << *Info.LastADRP); @@ -393,12 +393,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, // Produce LOH directive if possible. if (Info.IsCandidate) { switch (Info.Type) { - case MCLOH_AdrpAdd: + case MCLOH_AdrpAdd: { + // ADRPs and ADDs for this candidate may be split apart if using + // GlobalISel instead of pseudo-expanded. If that happens, the + // def register of the ADD may have a use in between. Adding an LOH in + // this case can cause the linker to rewrite the ADRP to write to that + // register, clobbering the use. + const MachineInstr *AddMI = Info.MI0; + int DefIdx = mapRegToGPRIndex(MI.getOperand(0).getReg()); + int OpIdx = mapRegToGPRIndex(AddMI->getOperand(0).getReg()); + LOHInfo DefInfo = LOHInfos[OpIdx]; + if (DefIdx != OpIdx && (DefInfo.OneUser || DefInfo.MultiUsers)) + break; LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t' << *Info.MI0); AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0}); ++NumADRSimpleCandidate; break; + } case MCLOH_AdrpLdr: if (supportLoadFromLiteral(*Info.MI0)) { LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" @@ -522,7 +534,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { // Walk the basic block backwards and update the per register state machine // in the process. 
- for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { + for (const MachineInstr &MI : + instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AArch64::ADDXri: @@ -544,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { const MachineOperand &Op0 = MI.getOperand(0); int Idx = mapRegToGPRIndex(Op0.getReg()); if (Idx >= 0) { - handleADRP(MI, AFI, LOHInfos[Idx]); + handleADRP(MI, AFI, LOHInfos[Idx], LOHInfos); continue; } break; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index bb99f2516ecf0..aa41cae289e8b 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -11,8 +11,74 @@ include "llvm/Target/GlobalISel/Combine.td" +def fconstant_to_constant : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_FCONSTANT):$root, + [{ return matchFConstantToConstant(*${root}, MRI); }]), + (apply [{ applyFConstantToConstant(*${root}); }])>; + def AArch64PreLegalizerCombinerHelper: GICombinerHelper< "AArch64GenPreLegalizerCombinerHelper", [all_combines, - elide_br_by_inverting_cond]> { + elide_br_by_inverting_cond, + fconstant_to_constant]> { let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; + let StateClass = "AArch64PreLegalizerCombinerHelperState"; + let AdditionalArguments = []; +} + +// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a +// target-specific opcode. +def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">; + +def rev : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchREV(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def zip : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchZip(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def uzp : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def dup: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchDup(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def trn : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchTRN(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def ext: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchEXT(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyEXT(*${root}, ${matchinfo}); }]) +>; + +// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo +// instruction. 
+def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>; + +def AArch64PostLegalizerCombinerHelper + : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", + [erase_undef_store, combines_for_extload, + sext_already_extended, shuffle_vector_pseudos]> { + let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 2592387059652..57dc8a4061f12 100644 --- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -79,7 +79,7 @@ void AArch64CompressJumpTables::scanFunction() { for (MachineBasicBlock &MBB : *MF) { const Align Alignment = MBB.getAlignment(); unsigned AlignedOffset; - if (Alignment == Align::None()) + if (Alignment == Align(1)) AlignedOffset = Offset; else AlignedOffset = alignTo(Offset, Alignment); diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index 25e23e4623de1..e90e8e3da0576 100644 --- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -194,12 +194,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, // There must not be any instruction between DefMI and MI that clobbers or // reads NZCV. - MachineBasicBlock::iterator I(DefMI), E(MI); - for (I = std::next(I); I != E; ++I) { - if (I->modifiesRegister(AArch64::NZCV, TRI) || - I->readsRegister(AArch64::NZCV, TRI)) - return false; - } + if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI)) + return false; LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); LLVM_DEBUG(DefMI.print(dbgs())); LLVM_DEBUG(dbgs() << " "); @@ -253,12 +249,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, return false; // There must not be any instruction between DefMI and MI that clobbers or // reads NZCV. - MachineBasicBlock::iterator I(DefMI), E(MI); - for (I = std::next(I); I != E; ++I) { - if (I->modifiesRegister(AArch64::NZCV, TRI) || - I->readsRegister(AArch64::NZCV, TRI)) - return false; - } + if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI)) + return false; LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); LLVM_DEBUG(DefMI.print(dbgs())); LLVM_DEBUG(dbgs() << " "); diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 51b2ce0297019..64f0bb63762de 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -145,11 +145,11 @@ void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { // instructions. MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( MachineBasicBlock *MBB) { - MachineBasicBlock::iterator I = MBB->getFirstTerminator(); - if (I == MBB->end()) + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + if (Term == MBB->end()) return nullptr; - if (I->getOpcode() != AArch64::Bcc) + if (Term->getOpcode() != AArch64::Bcc) return nullptr; // Since we may modify cmp of this MBB, make sure NZCV does not live out. @@ -158,32 +158,33 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( return nullptr; // Now find the instruction controlling the terminator. 
- for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; - assert(!I->isTerminator() && "Spurious terminator"); + for (MachineBasicBlock::iterator B = MBB->begin(), It = Term; It != B;) { + It = prev_nodbg(It, B); + MachineInstr &I = *It; + assert(!I.isTerminator() && "Spurious terminator"); // Check if there is any use of NZCV between CMP and Bcc. - if (I->readsRegister(AArch64::NZCV)) + if (I.readsRegister(AArch64::NZCV)) return nullptr; - switch (I->getOpcode()) { + switch (I.getOpcode()) { // cmp is an alias for subs with a dead destination register. case AArch64::SUBSWri: case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. case AArch64::ADDSWri: case AArch64::ADDSXri: { - unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); - if (!I->getOperand(2).isImm()) { - LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + unsigned ShiftAmt = AArch64_AM::getShiftValue(I.getOperand(3).getImm()); + if (!I.getOperand(2).isImm()) { + LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << I << '\n'); return nullptr; - } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { - LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I + } else if (I.getOperand(2).getImm() << ShiftAmt >= 0xfff) { + LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << I << '\n'); return nullptr; - } else if (!MRI->use_empty(I->getOperand(0).getReg())) { - LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + } else if (!MRI->use_nodbg_empty(I.getOperand(0).getReg())) { + LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << I << '\n'); return nullptr; } - return &*I; + return &I; } // Prevent false positive case like: // cmp w19, #0 @@ -294,12 +295,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, .add(BrMI.getOperand(1)); BrMI.eraseFromParent(); - MBB->updateTerminator(); - ++NumConditionsAdjusted; } -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// Parse a condition code returned by analyzeBranch, and compute the CondCode // corresponding to TBB. // Returns true if parsing was successful, otherwise false is returned. static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 054ef8f482ca9..82e8df3b73f90 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -157,7 +157,7 @@ public: MachineInstr *CmpMI; private: - /// The branch condition in Head as determined by AnalyzeBranch. + /// The branch condition in Head as determined by analyzeBranch. SmallVector<MachineOperand, 4> HeadCond; /// The condition code that makes Head branch to CmpBB. @@ -267,7 +267,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { return MRI->use_nodbg_empty(DstReg); } -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// Parse a condition code returned by analyzeBranch, and compute the CondCode // corresponding to TBB. // Return static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { @@ -317,7 +317,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { // Now find the instruction controlling the terminator. 
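A recurring theme in these hunks (instructionsWithoutDebug above, prev_nodbg and use_nodbg_empty in the loop below) is making reverse scans skip debug instructions, so the presence of DBG_VALUEs cannot change codegen decisions. A minimal sketch of that iteration pattern, using a toy Instr type instead of MachineInstr (the real loops additionally dispatch on opcode):

#include <vector>

struct Instr {
  bool IsDebug; // stands in for MI.isDebugInstr()
};

// Step backwards from position It towards the start of the block, ignoring
// debug instructions; return the previous "real" instruction or nullptr if
// the start of the block is reached first. Mirrors what prev_nodbg() does.
static const Instr *prevNonDebug(const std::vector<Instr> &Block, size_t It) {
  while (It != 0) {
    --It;
    if (!Block[It].IsDebug)
      return &Block[It];
  }
  return nullptr;
}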
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; + I = prev_nodbg(I, MBB->begin()); assert(!I->isTerminator() && "Spurious terminator"); switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. @@ -509,7 +509,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // landing pad. if (!TBB || HeadCond.empty()) { LLVM_DEBUG( - dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); + dbgs() << "analyzeBranch didn't find conditional branch in Head.\n"); ++NumHeadBranchRejs; return false; } @@ -536,7 +536,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { if (!TBB || CmpBBCond.empty()) { LLVM_DEBUG( - dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); + dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n"); ++NumCmpBranchRejs; return false; } @@ -710,7 +710,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { .add(CmpMI->getOperand(1)); // Branch target. } CmpMI->eraseFromParent(); - Head->updateTerminator(); + Head->updateTerminator(CmpBB->getNextNode()); RemovedBlocks.push_back(CmpBB); CmpBB->eraseFromParent(); @@ -828,7 +828,7 @@ void AArch64ConditionalCompares::updateDomTree( assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) - DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->changeImmediateDominator(Node->back(), HeadNode); DomTree->eraseNode(RemovedMBB); } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 3b8f8a19fe49c..9e65ad2e18f95 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -68,6 +68,8 @@ private: bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + bool expand_DestructiveOp(MachineInstr &MI, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, @@ -78,6 +80,9 @@ private: bool expandSetTagLoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandSVESpillFill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned Opc, + unsigned N); }; } // end anonymous namespace @@ -344,27 +349,225 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( return true; } +/// \brief Expand Pseudos to Instructions with destructive operands. +/// +/// This mechanism uses MOVPRFX instructions for zeroing the false lanes +/// or for fixing relaxed register allocation conditions to comply with +/// the instructions register constraints. The latter case may be cheaper +/// than setting the register constraints in the register allocator, +/// since that will insert regular MOV instructions rather than MOVPRFX. +/// +/// Example (after register allocation): +/// +/// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0 +/// +/// * The Pseudo FSUB_ZPZZ_ZERO_B maps to FSUB_ZPmZ_B. +/// * We cannot map directly to FSUB_ZPmZ_B because the register +/// constraints of the instruction are not met. +/// * Also the _ZERO specifies the false lanes need to be zeroed. +/// +/// We first try to see if the destructive operand == result operand, +/// if not, we try to swap the operands, e.g. 
+/// +/// FSUB_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// But because FSUB_ZPmZ is not commutative, this is semantically +/// different, so we need a reverse instruction: +/// +/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// Then we implement the zeroing of the false lanes of Z0 by adding +/// a zeroing MOVPRFX instruction: +/// +/// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0 +/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// Note that this can only be done for _ZERO or _UNDEF variants where +/// we can guarantee the false lanes to be zeroed (by implementing this) +/// or that they are undef (don't care / not used), otherwise the +/// swapping of operands is illegal because the operation is not +/// (or cannot be emulated to be) fully commutative. +bool AArch64ExpandPseudo::expand_DestructiveOp( + MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode()); + uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask; + uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask; + bool FalseZero = FalseLanes == AArch64::FalseLanesZero; + + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + + if (DType == AArch64::DestructiveBinary) + assert(DstReg != MI.getOperand(3).getReg()); + + bool UseRev = false; + unsigned PredIdx, DOPIdx, SrcIdx; + switch (DType) { + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + if (DstReg == MI.getOperand(3).getReg()) { + // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1 + std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2); + UseRev = true; + break; + } + LLVM_FALLTHROUGH; + case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: + std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); + break; + default: + llvm_unreachable("Unsupported Destructive Operand type"); + } + +#ifndef NDEBUG + // MOVPRFX can only be used if the destination operand + // is the destructive operand, not as any other operand, + // so the Destructive Operand must be unique. + bool DOPRegIsUnique = false; + switch (DType) { + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + DOPRegIsUnique = + DstReg != MI.getOperand(DOPIdx).getReg() || + MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg(); + break; + case AArch64::DestructiveBinaryImm: + DOPRegIsUnique = true; + break; + } +#endif + + // Resolve the reverse opcode + if (UseRev) { + int NewOpcode; + // e.g. DIV -> DIVR + if ((NewOpcode = AArch64::getSVERevInstr(Opcode)) != -1) + Opcode = NewOpcode; + // e.g. 
DIVR -> DIV + else if ((NewOpcode = AArch64::getSVENonRevInstr(Opcode)) != -1) + Opcode = NewOpcode; + } + + // Get the right MOVPRFX + uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode); + unsigned MovPrfx, MovPrfxZero; + switch (ElementSize) { + case AArch64::ElementSizeNone: + case AArch64::ElementSizeB: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B; + break; + case AArch64::ElementSizeH: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H; + break; + case AArch64::ElementSizeS: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S; + break; + case AArch64::ElementSizeD: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D; + break; + default: + llvm_unreachable("Unsupported ElementSize"); + } + + // + // Create the destructive operation (if required) + // + MachineInstrBuilder PRFX, DOP; + if (FalseZero) { +#ifndef NDEBUG + assert(DOPRegIsUnique && "The destructive operand should be unique"); +#endif + assert(ElementSize != AArch64::ElementSizeNone && + "This instruction is unpredicated"); + + // Merge source operand into destination register + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) + .addReg(DstReg, RegState::Define) + .addReg(MI.getOperand(PredIdx).getReg()) + .addReg(MI.getOperand(DOPIdx).getReg()); + + // After the movprfx, the destructive operand is same as Dst + DOPIdx = 0; + } else if (DstReg != MI.getOperand(DOPIdx).getReg()) { +#ifndef NDEBUG + assert(DOPRegIsUnique && "The destructive operand should be unique"); +#endif + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) + .addReg(DstReg, RegState::Define) + .addReg(MI.getOperand(DOPIdx).getReg()); + DOPIdx = 0; + } + + // + // Create the destructive operation + // + DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + + switch (DType) { + case AArch64::DestructiveBinaryImm: + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + DOP.add(MI.getOperand(PredIdx)) + .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .add(MI.getOperand(SrcIdx)); + break; + } + + if (PRFX) { + finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); + transferImpOps(MI, PRFX, DOP); + } else + transferImpOps(MI, DOP, DOP); + + MI.eraseFromParent(); + return true; +} + bool AArch64ExpandPseudo::expandSetTagLoop( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(2).getReg(); - Register AddressReg = MI.getOperand(3).getReg(); + Register SizeReg = MI.getOperand(0).getReg(); + Register AddressReg = MI.getOperand(1).getReg(); MachineFunction *MF = MBB.getParent(); - bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode = + bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback; + const unsigned OpCode1 = + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; + const unsigned OpCode2 = ZeroData ? 
AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex; + unsigned Size = MI.getOperand(2).getImm(); + assert(Size > 0 && Size % 16 == 0); + if (Size % (16 * 2) != 0) { + BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg) + .addReg(AddressReg) + .addReg(AddressReg) + .addImm(1); + Size -= 16; + } + MachineBasicBlock::iterator I = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg) + .addImm(Size); + expandMOVImm(MBB, I, 64); + auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoopBB); MF->insert(++LoopBB->getIterator(), DoneBB); - BuildMI(LoopBB, DL, TII->get(OpCode)) + BuildMI(LoopBB, DL, TII->get(OpCode2)) .addDef(AddressReg) .addReg(AddressReg) .addReg(AddressReg) @@ -402,6 +605,28 @@ bool AArch64ExpandPseudo::expandSetTagLoop( return true; } +bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned Opc, unsigned N) { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + MachineInstr &MI = *MBBI; + for (unsigned Offset = 0; Offset < N; ++Offset) { + int ImmOffset = MI.getOperand(2).getImm() + Offset; + bool Kill = (Offset + 1 == N) ? MI.getOperand(1).isKill() : false; + assert(ImmOffset >= -256 && ImmOffset < 256 && + "Immediate spill offset out of range"); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg( + TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset), + Opc == AArch64::LDR_ZXI ? RegState::Define : 0) + .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill)) + .addImm(ImmOffset); + } + MI.eraseFromParent(); + return true; +} + /// If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -409,10 +634,76 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); + + // Check if we can expand the destructive op + int OrigInstr = AArch64::getSVEPseudoMap(MI.getOpcode()); + if (OrigInstr != -1) { + auto &Orig = TII->get(OrigInstr); + if ((Orig.TSFlags & AArch64::DestructiveInstTypeMask) + != AArch64::NotDestructive) { + return expand_DestructiveOp(MI, MBB, MBBI); + } + } + switch (Opcode) { default: break; + case AArch64::BSPv8i8: + case AArch64::BSPv16i8: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to BIT + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to BIF + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + } else { + // Expand to BSL, use additional move if required + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } else { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::ORRv8i8 + : AArch64::ORRv16i8)) + .addReg(DstReg, + RegState::Define | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } + } + MI.eraseFromParent(); + return true; + } + case AArch64::ADDWrr: case AArch64::SUBWrr: case AArch64::ADDXrr: @@ -599,10 +890,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, Register DstReg = MI.getOperand(0).getReg(); auto SysReg = AArch64SysReg::TPIDR_EL0; MachineFunction *MF = MBB.getParent(); - if (MF->getTarget().getTargetTriple().isOSFuchsia() && - MF->getTarget().getCodeModel() == CodeModel::Kernel) - SysReg = AArch64SysReg::TPIDR_EL1; - else if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP()) + if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP()) SysReg = AArch64SysReg::TPIDR_EL3; else if (MF->getSubtarget<AArch64Subtarget>().useEL2ForTP()) SysReg = AArch64SysReg::TPIDR_EL2; @@ -676,7 +964,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, // almost always point to SP-after-prologue; if not, emit a longer // instruction sequence. int BaseOffset = -AFI->getTaggedBasePointerOffset(); - unsigned FrameReg; + Register FrameReg; StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, /*PreferFP=*/false, @@ -706,9 +994,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::STGloop_wback: + case AArch64::STZGloop_wback: + return expandSetTagLoop(MBB, MBBI, NextMBBI); case AArch64::STGloop: case AArch64::STZGloop: - return expandSetTagLoop(MBB, MBBI, NextMBBI); + report_fatal_error( + "Non-writeback variants of STGloop / STZGloop should not " + "survive past PrologEpilogInserter."); + case AArch64::STR_ZZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4); + case AArch64::STR_ZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); + case AArch64::STR_ZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); + case AArch64::LDR_ZZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); + case AArch64::LDR_ZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); + case AArch64::LDR_ZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index c1fc183b04f6f..538863ebe95af 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -823,9 +823,6 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); TRI = ST.getRegisterInfo(); - assert(TRI->trackLivenessAfterRegAlloc(Fn) && - "Register liveness not available!"); - MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>(); Modified = false; diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 7e9c68f2bb305..0f63f4ca62e5e 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -434,11 +434,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { // 
Materialize via constant pool. MachineConstantPool wants an explicit // alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) - Align = DL.getTypeAllocSize(CFP->getType()); + Align Alignment = DL.getPrefTypeAlign(CFP->getType()); - unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment); unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); @@ -1130,7 +1128,7 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -3137,7 +3135,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, Addr.setReg(AArch64::SP); Addr.setOffset(VA.getLocMemOffset() + BEAlign); - unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + Align Alignment = DL.getABITypeAlign(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); @@ -3272,7 +3270,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue the call. MachineInstrBuilder MIB; if (Subtarget->useSmallAddressing()) { - const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL); + const MCInstrDesc &II = + TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : (unsigned)AArch64::BL); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); if (Symbol) MIB.addSym(Symbol, 0); @@ -3305,7 +3304,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (!CallReg) return false; - const MCInstrDesc &II = TII.get(AArch64::BLR); + const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF)); CallReg = constrainOperandRegClass(II, CallReg, 0); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ea3e800a1ad20..efa3fd5ca9cef 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -170,8 +170,45 @@ static cl::opt<bool> cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); +static cl::opt<bool> StackTaggingMergeSetTag( + "stack-tagging-merge-settag", + cl::desc("merge settag instruction in function epilog"), cl::init(true), + cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); +/// Returns the argument pop size. 
+static uint64_t getArgumentPopSize(MachineFunction &MF, + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + bool IsTailCallReturn = false; + if (MBB.end() != MBBI) { + unsigned RetOpcode = MBBI->getOpcode(); + IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || + RetOpcode == AArch64::TCRETURNri || + RetOpcode == AArch64::TCRETURNriBTI; + } + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + + uint64_t ArgumentPopSize = 0; + if (IsTailCallReturn) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + return ArgumentPopSize; +} + /// This is the biggest offset to the stack pointer we can encode in aarch64 /// instructions (without using a separate calculation and a temp register). /// Note that the exception here are vector stores/loads which cannot encode any @@ -211,6 +248,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const { return TargetStackID::SVEVector; } +/// Returns the size of the fixed object area (allocated next to sp on entry) +/// On Win64 this may include a var args area and an UnwindHelp object for EH. +static unsigned getFixedObjectSize(const MachineFunction &MF, + const AArch64FunctionInfo *AFI, bool IsWin64, + bool IsFunclet) { + if (!IsWin64 || IsFunclet) { + // Only Win64 uses fixed objects, and then only for the function (not + // funclets) + return 0; + } else { + // Var args are stored here in the primary function. + const unsigned VarArgsArea = AFI->getVarArgsGPRSize(); + // To support EH funclets we allocate an UnwindHelp object + const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0); + return alignTo(VarArgsArea + UnwindHelpObject, 16); + } +} + /// Returns the size of the entire SVE stackframe (calleesaves + spills). static StackOffset getSVEStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -286,10 +341,8 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; if (!hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - int64_t Amount = I->getOperand(0).getImm(); - Amount = alignTo(Amount, Align); + Amount = alignTo(Amount, getStackAlign()); if (!IsDestroy) Amount = -Amount; @@ -480,6 +533,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( return true; } +bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( + MachineBasicBlock &MBB, unsigned StackBumpBytes) const { + if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) + return false; + + if (MBB.empty()) + return true; + + // Disable combined SP bump if the last instruction is an MTE tag store. It + // is almost always better to merge SP adjustment into those instructions. 
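Stepping back to the getFixedObjectSize helper introduced a few hunks up in this file: it replaces two copies of the Win64 var-args computation and now also reserves room for the 8-byte UnwindHelp slot. A standalone arithmetic sketch with purely illustrative sizes (the 24-byte var-args area below is made up):

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors getFixedObjectSize(): nothing is reserved for funclets or for
// non-Win64 functions; otherwise the var-args area plus an optional 8-byte
// UnwindHelp object, rounded up to 16 bytes.
static unsigned fixedObjectSize(bool IsWin64, bool IsFunclet,
                                unsigned VarArgsGPRSize, bool HasEHFunclets) {
  if (!IsWin64 || IsFunclet)
    return 0;
  return alignTo(VarArgsGPRSize + (HasEHFunclets ? 8 : 0), 16);
}

int main() {
  // Hypothetical frame: 24 bytes of saved var-arg GPRs, EH funclets present.
  assert(fixedObjectSize(true, false, 24, true) == 32);
  assert(fixedObjectSize(true, true, 24, true) == 0); // funclet: parent owns it
  return 0;
}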
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator(); + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastI != Begin) { + --LastI; + if (LastI->isTransient()) + continue; + if (!LastI->getFlag(MachineInstr::FrameDestroy)) + break; + } + switch (LastI->getOpcode()) { + case AArch64::STGloop: + case AArch64::STZGloop: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return false; + default: + return true; + } + llvm_unreachable("unreachable"); +} + // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, @@ -940,11 +1026,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } @@ -959,10 +1045,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = (IsWin64 && !IsFunclet) ? - alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. @@ -993,32 +1076,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; } - // The code below is not applicable to funclets. We have emitted all the SEH - // opcodes that we needed to emit. The FP and BP belong to the containing - // function. - if (IsFunclet) { - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - - // SEH funclets are passed the frame pointer in X1. If the parent - // function uses the base register, then the base register is used - // directly, and is not retrieved from X1. - if (F.hasPersonalityFn()) { - EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); - if (isAsynchronousEHPersonality(Per)) { - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) - .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup); - MBB.addLiveIn(AArch64::X1); - } - } - - return; - } - - if (HasFP) { + // For funclets the FP belongs to the containing function. + if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. int64_t FPOffset = isTargetDarwin(MF) ? 
(AFI->getCalleeSavedStackSize() - 16) : 0; @@ -1099,7 +1158,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) + BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF))) .addReg(AArch64::X16, RegState::Kill) .addReg(AArch64::X15, RegState::Implicit | RegState::Define) .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) @@ -1161,7 +1220,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Allocate space for the rest of the frame. if (NumBytes) { - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + // Alignment is required for the parent frame, not the funclet + const bool NeedsRealignment = + !IsFunclet && RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1179,8 +1240,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { - const unsigned Alignment = MFI.getMaxAlignment(); - const unsigned NrBitsToZero = countTrailingZeros(Alignment); + const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); assert(NrBitsToZero > 1); assert(scratchSPReg != AArch64::SP); @@ -1215,7 +1275,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is // needed. - if (RegInfo->hasBasePointer(MF)) { + // For funclets the BP belongs to the containing function. + if (!IsFunclet && RegInfo->hasBasePointer(MF)) { TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, false); if (NeedsWinCFI) { @@ -1232,6 +1293,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + // SEH funclets are passed the frame pointer in X1. If the parent + // function uses the base register, then the base register is used + // directly, and is not retrieved from X1. + if (IsFunclet && F.hasPersonalityFn()) { + EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); + if (isAsynchronousEHPersonality(Per)) { + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) + .addReg(AArch64::X1) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(AArch64::X1); + } + } + if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); const int StackGrowth = isTargetDarwin(MF) @@ -1307,15 +1381,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (HasFP) { // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, StackGrowth - FixedObject)); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. 
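Both def_cfa_offset changes in this prologue (the FrameLabel one above and the leaf-function one just below) flip the sign of the argument along with the switch from createDefCfaOffset to cfiDefCfaOffset, which suggests the new helper takes the CFA offset directly rather than a negated adjustment. A tiny model of that inference, using hypothetical stand-ins rather than the real MC API:

#include <cassert>

// Hypothetical stand-ins reduced to the offset each helper would record;
// inferred only from the -NumBytes -> NumBytes flips visible in this diff.
static int oldCreateDefCfaOffset(int Adjustment) { return -Adjustment; }
static int newCfiDefCfaOffset(int Offset) { return Offset; }

int main() {
  const int NumBytes = 48; // illustrative leaf-function stack size
  assert(oldCreateDefCfaOffset(-NumBytes) == newCfiDefCfaOffset(NumBytes));
  return 0;
}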
unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); + MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1374,7 +1448,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; - bool IsTailCallReturn = false; bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; bool IsFunclet = false; @@ -1385,10 +1458,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || - RetOpcode == AArch64::TCRETURNri || - RetOpcode == AArch64::TCRETURNriBTI; IsFunclet = isFuncletReturnInstr(*MBBI); } @@ -1403,21 +1472,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = 0; - if (IsTailCallReturn) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } + uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB); // The stack frame should be like below, // @@ -1450,10 +1505,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = - (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); uint64_t AfterCSRPopSize = ArgumentPopSize; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; @@ -1463,7 +1515,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. if (!CombineSPBump && PrologueSaveSize != 0) { @@ -1660,7 +1712,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, /// SP-relative and simple call frames aren't used. 
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { + Register &FrameReg) const { return resolveFrameIndexReference( MF, FI, FrameReg, /*PreferFP=*/ @@ -1679,7 +1731,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + unsigned FixedObject = + getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; @@ -1701,7 +1755,7 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, } StackOffset AArch64FrameLowering::resolveFrameIndexReference( - const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); @@ -1713,7 +1767,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( StackOffset AArch64FrameLowering::resolveFrameOffsetReference( const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, - unsigned &FrameReg, bool PreferFP, bool ForSimm) const { + Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); @@ -1764,10 +1818,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) { // Can't use BP. Forced to use FP. - assert(!SVEStackSize && "Expected BP to be available"); + else if (!CanUseBP) // Can't use BP. Forced to use FP. UseFP = true; - } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1933,7 +1985,7 @@ struct RegPairInfo { } // end anonymous namespace static void computeCalleeSaveRegisterPairs( - MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, + MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { @@ -2058,8 +2110,8 @@ static void computeCalleeSaveRegisterPairs( FixupDone = true; ByteOffset -= 8; assert(ByteOffset % 16 == 0); - assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); - MFI.setObjectAlignment(RPI.FrameIdx, 16); + assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); + MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); } int Offset = RPI.isScalable() ? 
ScalableByteOffset : ByteOffset; @@ -2078,8 +2130,7 @@ static void computeCalleeSaveRegisterPairs( bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); bool NeedsWinCFI = needsWinCFI(MF); @@ -2142,32 +2193,33 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size, Align; + unsigned Size; + Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = AArch64::STR_ZXI; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; Size = 2; - Align = 2; + Alignment = Align(2); break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); @@ -2196,7 +2248,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), - MachineMemOperand::MOStore, Size, Align)); + MachineMemOperand::MOStore, Size, Alignment)); } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) @@ -2204,8 +2256,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // where factor*scale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(MF,FrameIdxReg1), - MachineMemOperand::MOStore, Size, Align)); + MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), + MachineMemOperand::MOStore, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameSetup); @@ -2220,8 +2272,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; @@ -2248,32 +2299,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size, Align; + unsigned Size; + Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? 
AArch64::LDPQi : AArch64::LDRQui; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = AArch64::LDR_ZXI; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; Size = 2; - Align = 2; + Alignment = Align(2); break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); @@ -2296,7 +2348,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), - MachineMemOperand::MOLoad, Size, Align)); + MachineMemOperand::MOLoad, Size, Alignment)); } MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) @@ -2305,7 +2357,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), - MachineMemOperand::MOLoad, Size, Align)); + MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); }; @@ -2348,6 +2400,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2396,6 +2449,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } } + if (MF.getFunction().getCallingConv() == CallingConv::Win64 && + !Subtarget.isTargetWindows()) { + // For Windows calling convention on a non-windows OS, where X18 is treated + // as reserved, back up X18 when entering non-windows code (marked with the + // Windows calling convention) and restore when returning regardless of + // whether the individual function uses it - it might call other functions + // that clobber it. + SavedRegs.set(AArch64::X18); + } + // Calculates the callee saved stack size. unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; @@ -2467,8 +2530,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass &RC = AArch64::GPR64RegClass; unsigned Size = TRI->getSpillSize(RC); - unsigned Align = TRI->getSpillAlignment(RC); - int FI = MFI.CreateStackObject(Size, Align, false); + Align Alignment = TRI->getSpillAlign(RC); + int FI = MFI.CreateStackObject(Size, Alignment, false); RS->addScavengingFrameIndex(FI); LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); @@ -2549,12 +2612,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { // Make sure to align the last callee save slot. - MFI.setObjectAlignment(MaxCSFrameIndex, 16U); + MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); // Assign offsets to the callee save slots. 
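The loop that follows (and the later loop over the remaining SVE locals and spills) assigns each SVE object an offset that grows downwards from the base of the SVE area: add the object's size, round up to its alignment, and record the negated running total. A minimal sketch of that arithmetic with made-up object sizes:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static int64_t alignTo(int64_t Value, int64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Hypothetical SVE objects: {size, alignment} in bytes.
  std::vector<std::pair<int64_t, int64_t>> Objects = {{16, 16}, {32, 16}};
  int64_t Offset = 0;
  std::vector<int64_t> Assigned;
  for (auto &Obj : Objects) {
    Offset = alignTo(Offset + Obj.first, Obj.second); // grow the SVE area
    Assigned.push_back(-Offset);                      // objects live below it
  }
  assert(Assigned[0] == -16 && Assigned[1] == -48);
  return 0;
}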
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlignment(I)); + Offset = alignTo(Offset, MFI.getObjectAlign(I)); if (AssignOffsets) Assign(I, -Offset); } @@ -2576,15 +2639,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Allocate all SVE locals and spills for (unsigned FI : ObjectsToAllocate) { - unsigned Align = MFI.getObjectAlignment(FI); + Align Alignment = MFI.getObjectAlign(FI); // FIXME: Given that the length of SVE vectors is not necessarily a power of // two, we'd need to align every object dynamically at runtime if the // alignment is larger than 16. This is not yet supported. - if (Align > 16) + if (Alignment > Align(16)) report_fatal_error( "Alignment of scalable vectors > 16 bytes is not yet supported"); - Offset = alignTo(Offset + MFI.getObjectSize(FI), Align); + Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); if (AssignOffsets) Assign(FI, -Offset); } @@ -2632,9 +2695,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( ++MBBI; // Create an UnwindHelp object. - int UnwindHelpFI = - MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); + // The UnwindHelp object is allocated at the start of the fixed object area + int64_t FixedObject = + getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false); + int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8, + /*SPOffset*/ -FixedObject, + /*IsImmutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; + // We need to store -2 into the UnwindHelp object at the start of the // function. DebugLoc DL; @@ -2649,17 +2717,411 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( .addImm(0); } -/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before -/// the update. This is easily retrieved as it is exactly the offset that is set -/// in processFunctionBeforeFrameFinalized. +namespace { +struct TagStoreInstr { + MachineInstr *MI; + int64_t Offset, Size; + explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) + : MI(MI), Offset(Offset), Size(Size) {} +}; + +class TagStoreEdit { + MachineFunction *MF; + MachineBasicBlock *MBB; + MachineRegisterInfo *MRI; + // Tag store instructions that are being replaced. + SmallVector<TagStoreInstr, 8> TagStores; + // Combined memref arguments of the above instructions. + SmallVector<MachineMemOperand *, 8> CombinedMemRefs; + + // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + + // FrameRegOffset + Size) with the address tag of SP. + Register FrameReg; + StackOffset FrameRegOffset; + int64_t Size; + // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end. + Optional<int64_t> FrameRegUpdate; + // MIFlags for any FrameReg updating instructions. + unsigned FrameRegUpdateFlags; + + // Use zeroing instruction variants. + bool ZeroData; + DebugLoc DL; + + void emitUnrolled(MachineBasicBlock::iterator InsertI); + void emitLoop(MachineBasicBlock::iterator InsertI); + +public: + TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) + : MBB(MBB), ZeroData(ZeroData) { + MF = MBB->getParent(); + MRI = &MF->getRegInfo(); + } + // Add an instruction to be replaced. Instructions must be added in the + // ascending order of Offset, and have to be adjacent. 
+ void addInstruction(TagStoreInstr I) { + assert((TagStores.empty() || + TagStores.back().Offset + TagStores.back().Size == I.Offset) && + "Non-adjacent tag store instructions."); + TagStores.push_back(I); + } + void clear() { TagStores.clear(); } + // Emit equivalent code at the given location, and erase the current set of + // instructions. May skip if the replacement is not profitable. May invalidate + // the input iterator and replace it with a valid one. + void emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast); +}; + +void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); + + const int64_t kMinOffset = -256 * 16; + const int64_t kMaxOffset = 255 * 16; + + Register BaseReg = FrameReg; + int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes(); + if (BaseRegOffsetBytes < kMinOffset || + BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { + Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, + {BaseRegOffsetBytes, MVT::i8}, TII); + BaseReg = ScratchReg; + BaseRegOffsetBytes = 0; + } + + MachineInstr *LastI = nullptr; + while (Size) { + int64_t InstrSize = (Size > 16) ? 32 : 16; + unsigned Opcode = + InstrSize == 16 + ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset) + : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset); + MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) + .addReg(AArch64::SP) + .addReg(BaseReg) + .addImm(BaseRegOffsetBytes / 16) + .setMemRefs(CombinedMemRefs); + // A store to [BaseReg, #0] should go last for an opportunity to fold the + // final SP adjustment in the epilogue. + if (BaseRegOffsetBytes == 0) + LastI = I; + BaseRegOffsetBytes += InstrSize; + Size -= InstrSize; + } + + if (LastI) + MBB->splice(InsertI, MBB, LastI); +} + +void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); + + Register BaseReg = FrameRegUpdate + ? FrameReg + : MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); + + int64_t LoopSize = Size; + // If the loop size is not a multiple of 32, split off one 16-byte store at + // the end to fold BaseReg update into. + if (FrameRegUpdate && *FrameRegUpdate) + LoopSize -= LoopSize % 32; + MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGloop_wback + : AArch64::STGloop_wback)) + .addDef(SizeReg) + .addDef(BaseReg) + .addImm(LoopSize) + .addReg(BaseReg) + .setMemRefs(CombinedMemRefs); + if (FrameRegUpdate) + LoopI->setFlags(FrameRegUpdateFlags); + + int64_t ExtraBaseRegUpdate = + FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0; + if (LoopSize < Size) { + assert(FrameRegUpdate); + assert(Size - LoopSize == 16); + // Tag 16 more bytes at BaseReg and update BaseReg. + BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) + .addDef(BaseReg) + .addReg(BaseReg) + .addReg(BaseReg) + .addImm(1 + ExtraBaseRegUpdate / 16) + .setMemRefs(CombinedMemRefs) + .setMIFlags(FrameRegUpdateFlags); + } else if (ExtraBaseRegUpdate) { + // Update BaseReg. + BuildMI( + *MBB, InsertI, DL, + TII->get(ExtraBaseRegUpdate > 0 ? 
AArch64::ADDXri : AArch64::SUBXri)) + .addDef(BaseReg) + .addReg(BaseReg) + .addImm(std::abs(ExtraBaseRegUpdate)) + .addImm(0) + .setMIFlags(FrameRegUpdateFlags); + } +} + +// Check if *II is a register update that can be merged into STGloop that ends +// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the +// end of the loop. +bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, + int64_t Size, int64_t *TotalOffset) { + MachineInstr &MI = *II; + if ((MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::SUBXri) && + MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { + unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); + int64_t Offset = MI.getOperand(2).getImm() << Shift; + if (MI.getOpcode() == AArch64::SUBXri) + Offset = -Offset; + int64_t AbsPostOffset = std::abs(Offset - Size); + const int64_t kMaxOffset = + 0xFFF; // Max encoding for unshifted ADDXri / SUBXri + if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) { + *TotalOffset = Offset; + return true; + } + } + return false; +} + +void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE, + SmallVectorImpl<MachineMemOperand *> &MemRefs) { + MemRefs.clear(); + for (auto &TS : TSE) { + MachineInstr *MI = TS.MI; + // An instruction without memory operands may access anything. Be + // conservative and return an empty list. + if (MI->memoperands_empty()) { + MemRefs.clear(); + return; + } + MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); + } +} + +void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast) { + if (TagStores.empty()) + return; + TagStoreInstr &FirstTagStore = TagStores[0]; + TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; + Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; + DL = TagStores[0].MI->getDebugLoc(); + + Register Reg; + FrameRegOffset = TFI->resolveFrameOffsetReference( + *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, + /*PreferFP=*/false, /*ForSimm=*/true); + FrameReg = Reg; + FrameRegUpdate = None; + + mergeMemRefs(TagStores, CombinedMemRefs); + + LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; + for (const auto &Instr + : TagStores) { dbgs() << " " << *Instr.MI; }); + + // Size threshold where a loop becomes shorter than a linear sequence of + // tagging instructions. + const int kSetTagLoopThreshold = 176; + if (Size < kSetTagLoopThreshold) { + if (TagStores.size() < 2) + return; + emitUnrolled(InsertI); + } else { + MachineInstr *UpdateInstr = nullptr; + int64_t TotalOffset; + if (IsLast) { + // See if we can merge base register update into the STGloop. + // This is done in AArch64LoadStoreOptimizer for "normal" stores, + // but STGloop is way too unusual for that, and also it only + // realistically happens in function epilogue. Also, STGloop is expanded + // before that pass. 
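The call that follows relies on canMergeRegUpdate (defined above) to decide whether a trailing SP adjustment can be folded into the tag-store loop's write-back: the leftover adjustment after the loop must be a multiple of 16 and fit an unshifted ADDXri/SUBXri immediate. A standalone sketch of just that test, with illustrative numbers:

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Can a final "add/sub Reg, Reg, #Offset" be merged into a tag-store loop
// that ends at (Reg + Size)? Mirrors the checks in canMergeRegUpdate above.
static bool canFoldRegUpdate(int64_t Offset, int64_t Size) {
  const int64_t kMaxOffset = 0xFFF; // max unshifted ADDXri/SUBXri immediate
  int64_t AbsPostOffset = std::abs(Offset - Size);
  return AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0;
}

int main() {
  assert(canFoldRegUpdate(/*Offset=*/496, /*Size=*/480));  // residual 16: ok
  assert(!canFoldRegUpdate(/*Offset=*/496, /*Size=*/488)); // residual 8: not 16-aligned
  return 0;
}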
+ if (InsertI != MBB->end() && + canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, + &TotalOffset)) { + UpdateInstr = &*InsertI++; + LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " + << *UpdateInstr); + } + } + + if (!UpdateInstr && TagStores.size() < 2) + return; + + if (UpdateInstr) { + FrameRegUpdate = TotalOffset; + FrameRegUpdateFlags = UpdateInstr->getFlags(); + } + emitLoop(InsertI); + if (UpdateInstr) + UpdateInstr->eraseFromParent(); + } + + for (auto &TS : TagStores) + TS.MI->eraseFromParent(); +} + +bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, + int64_t &Size, bool &ZeroData) { + MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = MI.getOpcode(); + ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || + Opcode == AArch64::STZ2GOffset); + + if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { + if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) + return false; + if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) + return false; + Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); + Size = MI.getOperand(2).getImm(); + return true; + } + + if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) + Size = 16; + else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) + Size = 32; + else + return false; + + if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) + return false; + + Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + + 16 * MI.getOperand(2).getImm(); + return true; +} + +// Detect a run of memory tagging instructions for adjacent stack frame slots, +// and replace them with a shorter instruction sequence: +// * replace STG + STG with ST2G +// * replace STGloop + STGloop with STGloop +// This code needs to run when stack slot offsets are already known, but before +// FrameIndex operands in STG instructions are eliminated. +MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, + const AArch64FrameLowering *TFI, + RegScavenger *RS) { + bool FirstZeroData; + int64_t Size, Offset; + MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator NextI = ++II; + if (&MI == &MBB->instr_back()) + return II; + if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) + return II; + + SmallVector<TagStoreInstr, 4> Instrs; + Instrs.emplace_back(&MI, Offset, Size); + + constexpr int kScanLimit = 10; + int Count = 0; + for (MachineBasicBlock::iterator E = MBB->end(); + NextI != E && Count < kScanLimit; ++NextI) { + MachineInstr &MI = *NextI; + bool ZeroData; + int64_t Size, Offset; + // Collect instructions that update memory tags with a FrameIndex operand + // and (when applicable) constant size, and whose output registers are dead + // (the latter is almost always the case in practice). Since these + // instructions effectively have no inputs or outputs, we are free to skip + // any non-aliasing instructions in between without tracking used registers. + if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { + if (ZeroData != FirstZeroData) + break; + Instrs.emplace_back(&MI, Offset, Size); + continue; + } + + // Only count non-transient, non-tagging instructions toward the scan + // limit. + if (!MI.isTransient()) + ++Count; + + // Just in case, stop before the epilogue code starts. 
+ if (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) + break; + + // Reject anything that may alias the collected instructions. + if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) + break; + } + + // New code will be inserted after the last tagging instruction we've found. + MachineBasicBlock::iterator InsertI = Instrs.back().MI; + InsertI++; + + llvm::stable_sort(Instrs, + [](const TagStoreInstr &Left, const TagStoreInstr &Right) { + return Left.Offset < Right.Offset; + }); + + // Make sure that we don't have any overlapping stores. + int64_t CurOffset = Instrs[0].Offset; + for (auto &Instr : Instrs) { + if (CurOffset > Instr.Offset) + return NextI; + CurOffset = Instr.Offset + Instr.Size; + } + + // Find contiguous runs of tagged memory and emit shorter instruction + // sequencies for them when possible. + TagStoreEdit TSE(MBB, FirstZeroData); + Optional<int64_t> EndOffset; + for (auto &Instr : Instrs) { + if (EndOffset && *EndOffset != Instr.Offset) { + // Found a gap. + TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.clear(); + } + + TSE.addInstruction(Instr); + EndOffset = Instr.Offset + Instr.Size; + } + + TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + + return InsertI; +} +} // namespace + +void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS = nullptr) const { + if (StackTaggingMergeSetTag) + for (auto &BB : MF) + for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) + II = tryMergeAdjacentSTG(II, this, RS); +} + +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP +/// before the update. This is easily retrieved as it is exactly the offset +/// that is set in processFunctionBeforeFrameFinalized. int AArch64FrameLowering::getFrameIndexReferencePreferSP( - const MachineFunction &MF, int FI, unsigned &FrameReg, + const MachineFunction &MF, int FI, Register &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " - << MFI.getObjectOffset(FI) << "\n"); - FrameReg = AArch64::SP; - return MFI.getObjectOffset(FI); + if (IgnoreSPUpdates) { + LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " + << MFI.getObjectOffset(FI) << "\n"); + FrameReg = AArch64::SP; + return MFI.getObjectOffset(FI); + } + + return getFrameIndexReference(MF, FI, FrameReg); } /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve @@ -2678,5 +3140,5 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize(); // This is the amount of stack a funclet needs to allocate. 
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), - getStackAlignment()); + getStackAlign()); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index b5719feb6b154..9d0a6d9eaf255 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,8 +24,9 @@ public: : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; + void + emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -39,23 +40,24 @@ public: bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, bool PreferFP, + Register &FrameReg, bool PreferFP, bool ForSimm) const; StackOffset resolveFrameOffsetReference(const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, - bool isSVE, unsigned &FrameReg, + bool isSVE, Register &FrameReg, bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; /// Can this function use the red zone for local allocations. 
bool canUseRedZone(const MachineFunction &MF) const; @@ -77,12 +79,16 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + void + processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, + RegScavenger *RS) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - unsigned &FrameReg, + Register &FrameReg, bool IgnoreSPUpdates) const override; int getNonLocalFrameIndexReference(const MachineFunction &MF, int FI) const override; @@ -107,6 +113,8 @@ private: int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; + bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index a51aa85a931c0..10c4778533533 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -62,6 +62,9 @@ public: unsigned ConstraintID, std::vector<SDValue> &OutOps) override; + template <signed Low, signed High, signed Scale> + bool SelectRDVLImm(SDValue N, SDValue &Imm); + bool tryMLAV64LaneV128(SDNode *N); bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); @@ -159,6 +162,24 @@ public: return false; } + bool SelectDupZero(SDValue N) { + switch(N->getOpcode()) { + case AArch64ISD::DUP: + case ISD::SPLAT_VECTOR: { + auto Opnd0 = N->getOperand(0); + if (auto CN = dyn_cast<ConstantSDNode>(Opnd0)) + if (CN->isNullValue()) + return true; + if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0)) + if (CN->isZero()) + return true; + break; + } + } + + return false; + } + template<MVT::SimpleValueType VT> bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) { return SelectSVEAddSubImm(N, VT, Imm, Shift); @@ -169,6 +190,11 @@ public: return SelectSVELogicalImm(N, VT, Imm); } + template <unsigned Low, unsigned High> + bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) { + return SelectSVEShiftImm64(N, Low, High, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. template<signed Min, signed Max, signed Scale, bool Shift> bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -197,6 +223,9 @@ public: /// unchanged; otherwise a REG_SEQUENCE value is returned. SDValue createDTuple(ArrayRef<SDValue> Vecs); SDValue createQTuple(ArrayRef<SDValue> Vecs); + // Form a sequence of SVE registers for instructions using list of vectors, + // e.g. structured loads and stores (ldN, stN). + SDValue createZTuple(ArrayRef<SDValue> Vecs); /// Generic helper for the createDTuple/createQTuple /// functions. Those should almost always be called instead. @@ -216,11 +245,31 @@ public: unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + + bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); + /// SVE Reg+Imm addressing mode. + template <int64_t Min, int64_t Max> + bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base, + SDValue &OffImm); + /// SVE Reg+Reg address mode. 
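  /// (That is, a [<Xn|SP>, <Xm>{, LSL #imm}] style address where the shift
  /// amount matches the element size implied by Scale; for byte accesses,
  /// Scale == 0 and no shift is applied. Illustrative summary only; the
  /// selection logic itself is in SelectSVERegRegAddrMode below.)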
+ template <unsigned Scale> + bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) { + return SelectSVERegRegAddrMode(N, Scale, Base, Offset); + } void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + template <unsigned Scale> + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, + const unsigned Opc_ri); + template <unsigned Scale> + std::tuple<unsigned, SDValue, SDValue> + findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, + const unsigned Opc_ri, const SDValue &OldBase, + const SDValue &OldOffset); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -268,13 +317,19 @@ private: bool SelectCMP_SWAP(SDNode *N); + bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift); + bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); + bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High, + SDValue &Imm); bool SelectSVEArithImm(SDValue N, SDValue &Imm); + bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, + SDValue &Offset); }; } // end anonymous namespace @@ -679,6 +734,23 @@ static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { return SDValue(Node, 0); } +// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. +template<signed Low, signed High, signed Scale> +bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue(); + if ((MulImm % std::abs(Scale)) == 0) { + int64_t RDVLImm = MulImm / Scale; + if ((RDVLImm >= Low) && (RDVLImm <= High)) { + Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32); + return true; + } + } + + return false; +} /// SelectArithExtendedRegister - Select a "extended register" operand. This /// operand folds in an extend followed by an optional left shift. @@ -832,16 +904,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, if (!GAN) return true; - if (GAN->getOffset() % Size == 0) { - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getValueType(); - if (Alignment == 0 && Ty->isSized()) - Alignment = DL.getABITypeAlignment(Ty); - - if (Alignment >= Size) - return true; - } + if (GAN->getOffset() % Size == 0 && + GAN->getGlobal()->getPointerAlignment(DL) >= Size) + return true; } if (CurDAG->isBaseWithConstantOffset(N)) { @@ -1132,6 +1197,16 @@ SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) { return createTuple(Regs, RegClassIDs, SubRegs); } +SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) { + static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID, + AArch64::ZPR3RegClassID, + AArch64::ZPR4RegClassID}; + static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2, AArch64::zsub3}; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, const unsigned RegClassIDs[], const unsigned SubRegs[]) { @@ -1240,6 +1315,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { } } else if (VT == MVT::f16) { Opcode = IsPre ? 
AArch64::LDRHpre : AArch64::LDRHpost; + } else if (VT == MVT::bf16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1334,6 +1411,54 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +/// Optimize \param OldBase and \param OldOffset selecting the best addressing +/// mode. Returns a tuple consisting of an Opcode, an SDValue representing the +/// new Base and an SDValue representing the new offset. +template <unsigned Scale> +std::tuple<unsigned, SDValue, SDValue> +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, + const unsigned Opc_ri, + const SDValue &OldBase, + const SDValue &OldOffset) { + SDValue NewBase = OldBase; + SDValue NewOffset = OldOffset; + // Detect a possible Reg+Imm addressing mode. + const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>( + N, OldBase, NewBase, NewOffset); + + // Detect a possible reg+reg addressing mode, but only if we haven't already + // detected a Reg+Imm one. + const bool IsRegReg = + !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset); + + // Select the instruction. + return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset); +} + +void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, + const unsigned Opc) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SDValue Ops[] = {N->getOperand(1), // Predicate + N->getOperand(2), // Memory operand + CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops); + SDValue SuperReg = SDValue(Load, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); @@ -1354,6 +1479,49 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } +template <unsigned Scale> +void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, + const unsigned Opc_rr, + const unsigned Opc_ri) { + SDLoc dl(N); + + // Form a REG_SEQUENCE to force register allocation. + SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = createZTuple(Regs); + + // Optimize addressing mode. 
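  // Illustrative outcome for an nxv4i32 st2 (register choices hypothetical):
  // depending on how the address was built, selection picks either form, e.g.
  //   st2w { z0.s, z1.s }, p0, [x0, #2, mul vl]    ; reg+imm  (Opc_ri)
  //   st2w { z0.s, z1.s }, p0, [x0, x1, lsl #2]    ; reg+reg  (Opc_rr)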
+ unsigned Opc; + SDValue Offset, Base; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>( + N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), + CurDAG->getTargetConstant(0, dl, MVT::i64)); + + SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate + Base, // address + Offset, // offset + N->getOperand(0)}; // chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + ReplaceNode(N, St); +} + +bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + + // Try to match it for the frame address + if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) { + int FI = FINode->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + return false; +} + void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); @@ -2632,7 +2800,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { // bits that are implicitly ANDed off by the above opcodes and if so, skip // the AND. uint64_t MaskImm; - if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm)) + if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) && + !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm)) return false; if (countTrailingOnes(MaskImm) < Bits) @@ -2879,6 +3048,32 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { return true; } +bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base, + SDValue &Offset) { + auto C = dyn_cast<ConstantSDNode>(N); + if (!C) + return false; + + auto Ty = N->getValueType(0); + + int64_t Imm = C->getSExtValue(); + SDLoc DL(N); + + if ((Imm >= -128) && (Imm <= 127)) { + Base = CurDAG->getTargetConstant(Imm, DL, Ty); + Offset = CurDAG->getTargetConstant(0, DL, Ty); + return true; + } + + if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) { + Base = CurDAG->getTargetConstant(Imm/256, DL, Ty); + Offset = CurDAG->getTargetConstant(8, DL, Ty); + return true; + } + + return false; +} + bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) { if (auto CNode = dyn_cast<ConstantSDNode>(N)) { const int64_t ImmVal = CNode->getZExtValue(); @@ -2917,7 +3112,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { if (auto CNode = dyn_cast<ConstantSDNode>(N)) { int64_t ImmVal = CNode->getSExtValue(); SDLoc DL(N); - if (ImmVal >= -127 && ImmVal < 127) { + if (ImmVal >= -128 && ImmVal < 128) { Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); return true; } @@ -2975,6 +3170,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) { return false; } +// This method is only needed to "cast" i64s into i32s when the value +// is a valid shift which has been splatted into a vector with i64 elements. +// Every other type is fine in tablegen. 
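// For example (illustrative, assuming Low/High admit the value): a shift
// amount arriving here as (splat_vector (i64 63)) is re-emitted as an i32
// target constant, so the immediate form of the shift, e.g.
//   lsl z0.d, z0.d, #63
// can be selected.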
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low, + uint64_t High, SDValue &Imm) { + if (auto *CN = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CN->getZExtValue(); + SDLoc DL(N); + + if (ImmVal >= Low && ImmVal <= High) { + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + } + + return false; +} + bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { // tagp(FrameIndex, IRGstack, tag_offset): // since the offset between FrameIndex and IRGstack is a compile-time @@ -3027,6 +3240,63 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) { ReplaceNode(N, N3); } +// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length +// vector types larger than NEON don't have a matching SubRegIndex. +static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) { + assert(V.getValueType().isScalableVector() && + V.getValueType().getSizeInBits().getKnownMinSize() == + AArch64::SVEBitsPerBlock && + "Expected to extract from a packed scalable vector!"); + assert(VT.isFixedLengthVector() && + "Expected to extract a fixed length vector!"); + + SDLoc DL(V); + switch (VT.getSizeInBits()) { + case 64: { + auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg); + } + case 128: { + auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg); + } + default: { + auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64); + return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); + } + } +} + +// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length +// vector types larger than NEON don't have a matching SubRegIndex. +static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) { + assert(VT.isScalableVector() && + VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock && + "Expected to insert into a packed scalable vector!"); + assert(V.getValueType().isFixedLengthVector() && + "Expected to insert a fixed length vector!"); + + SDLoc DL(V); + switch (V.getValueType().getSizeInBits()) { + case 64: { + auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, + SDValue(Container, 0), V, SubReg); + } + case 128: { + auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, + SDValue(Container, 0), V, SubReg); + } + default: { + auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64); + return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); + } + } +} + void AArch64DAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -3100,6 +3370,52 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break; + case ISD::EXTRACT_SUBVECTOR: { + // Bail when not a "cast" like extract_subvector. + if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0) + break; + + // Bail when normal isel can do the job. + EVT InVT = Node->getOperand(0).getValueType(); + if (VT.isScalableVector() || InVT.isFixedLengthVector()) + break; + + // NOTE: We can only get here when doing fixed length SVE code generation. 
+ // We do manual selection because the types involved are not linked to real + // registers (despite being legal) and must be coerced into SVE registers. + // + // NOTE: If the above changes, be aware that selection will still not work + // because the td definition of extract_vector does not support extracting + // a fixed length vector from a scalable vector. + + ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0))); + return; + } + + case ISD::INSERT_SUBVECTOR: { + // Bail when not a "cast" like insert_subvector. + if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0) + break; + if (!Node->getOperand(0).isUndef()) + break; + + // Bail when normal isel should do the job. + EVT InVT = Node->getOperand(1).getValueType(); + if (VT.isFixedLengthVector() || InVT.isScalableVector()) + break; + + // NOTE: We can only get here when doing fixed length SVE code generation. + // We do manual selection because the types involved are not linked to real + // registers (despite being legal) and must be coerced into SVE registers. + // + // NOTE: If the above changes, be aware that selection will still not work + // because the td definition of insert_vector does not support inserting a + // fixed length vector into a scalable vector. + + ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1))); + return; + } + case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. @@ -3185,10 +3501,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3212,10 +3528,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3239,10 +3555,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3266,10 +3582,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } 
else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3293,10 +3609,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3320,10 +3636,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3347,10 +3663,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3374,10 +3690,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3401,10 +3717,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3426,7 +3742,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 2, AArch64::LD2i8); return; } else if (VT == MVT::v8i16 || VT == 
MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 2, AArch64::LD2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3444,7 +3760,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 3, AArch64::LD3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 3, AArch64::LD3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3462,7 +3778,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 4, AArch64::LD4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 4, AArch64::LD4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3537,10 +3853,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST1Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST1Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST1Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3565,10 +3883,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 3, AArch64::ST1Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST1Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST1Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3593,10 +3913,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST1Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST1Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST1Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3621,10 +3943,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST2Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST2Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST2Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3649,10 +3973,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 3, AArch64::ST3Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == 
MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST3Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST3Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3677,10 +4003,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST4Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST4Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST4Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3703,7 +4031,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 2, AArch64::ST2i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 2, AArch64::ST2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3722,7 +4050,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 3, AArch64::ST3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 3, AArch64::ST3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3741,7 +4069,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 4, AArch64::ST4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 4, AArch64::ST4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3755,6 +4083,69 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } break; } + case Intrinsic::aarch64_sve_st2: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B, + AArch64::ST2B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H, + AArch64::ST2H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W, + AArch64::ST2W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D, + AArch64::ST2D_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_st3: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B, + AArch64::ST3B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H, + AArch64::ST3H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W, + AArch64::ST3W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D, + AArch64::ST3D_IMM); + return; + } + break; + } + case 
Intrinsic::aarch64_sve_st4: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B, + AArch64::ST4B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H, + AArch64::ST4H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W, + AArch64::ST4W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D, + AArch64::ST4D_IMM); + return; + } + break; + } } break; } @@ -3765,10 +4156,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3793,10 +4184,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3821,10 +4212,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3849,10 +4240,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3877,10 +4268,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, 
AArch64::LD1Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3905,10 +4296,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3933,10 +4324,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3961,10 +4352,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3989,10 +4380,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4017,10 +4408,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4043,7 +4434,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 1, 
AArch64::LD1i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4062,7 +4453,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4081,7 +4472,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4100,7 +4491,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4122,10 +4513,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4151,10 +4542,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4180,10 +4571,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4209,10 +4600,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + 
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4238,10 +4629,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 ) { SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4267,10 +4658,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4294,7 +4685,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4314,7 +4705,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4334,7 +4725,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4348,6 +4739,57 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } break; } + case AArch64ISD::SVE_LD2_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + return; + } + break; + } + case AArch64ISD::SVE_LD3_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 3, 
AArch64::LD3B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + return; + } + break; + } + case AArch64ISD::SVE_LD4_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + return; + } + break; + } } // Select the default instruction @@ -4360,3 +4802,130 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new AArch64DAGToDAGISel(TM, OptLevel); } + +/// When \p PredVT is a scalable vector predicate in the form +/// MVT::nx<M>xi1, it builds the correspondent scalable vector of +/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input +/// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid +/// EVT. +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { + if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) + return EVT(); + + if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 && + PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1) + return EVT(); + + ElementCount EC = PredVT.getVectorElementCount(); + EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + return MemVT; +} + +/// Return the EVT of the data associated to a memory operation in \p +/// Root. If such EVT cannot be retrived, it returns an invalid EVT. +static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { + if (isa<MemSDNode>(Root)) + return cast<MemSDNode>(Root)->getMemoryVT(); + + if (isa<MemIntrinsicSDNode>(Root)) + return cast<MemIntrinsicSDNode>(Root)->getMemoryVT(); + + const unsigned Opcode = Root->getOpcode(); + // For custom ISD nodes, we have to look at them individually to extract the + // type of the data moved to/from memory. + switch (Opcode) { + case AArch64ISD::LD1_MERGE_ZERO: + case AArch64ISD::LD1S_MERGE_ZERO: + case AArch64ISD::LDNF1_MERGE_ZERO: + case AArch64ISD::LDNF1S_MERGE_ZERO: + return cast<VTSDNode>(Root->getOperand(3))->getVT(); + case AArch64ISD::ST1_PRED: + return cast<VTSDNode>(Root->getOperand(4))->getVT(); + default: + break; + } + + if (Opcode != ISD::INTRINSIC_VOID) + return EVT(); + + const unsigned IntNo = + cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue(); + if (IntNo != Intrinsic::aarch64_sve_prf) + return EVT(); + + // We are using an SVE prefetch intrinsic. Type must be inferred + // from the width of the predicate. + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(2)->getValueType(0)); +} + +/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: +/// Base + OffImm * sizeof(MemVT) for Min >= OffImm <= Max +/// where Root is the memory access using N for its address. 
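/// For example (an illustrative ld1w; values are hypothetical): with
/// MemVT == nxv4i32 the per-vector width is 16 bytes (known minimum), so an
/// address of the form (add %base, (vscale * 32)) yields Base = %base and
/// OffImm = 2, provided 2 lies within [Min, Max]; the selected instruction
/// then uses a [x0, #2, mul vl] style operand.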
+template <int64_t Min, int64_t Max> +bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, + SDValue &Base, + SDValue &OffImm) { + const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); + + if (MemVT == EVT()) + return false; + + if (N.getOpcode() != ISD::ADD) + return false; + + SDValue VScale = N.getOperand(1); + if (VScale.getOpcode() != ISD::VSCALE) + return false; + + TypeSize TS = MemVT.getSizeInBits(); + int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8; + int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue(); + + if ((MulImm % MemWidthBytes) != 0) + return false; + + int64_t Offset = MulImm / MemWidthBytes; + if (Offset < Min || Offset > Max) + return false; + + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); + return true; +} + +/// Select register plus register addressing mode for SVE, with scaled +/// offset. +bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale, + SDValue &Base, + SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) + return false; + + // Process an ADD node. + const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + // 8 bit data does not come with the SHL node, so it is treated + // separately. + if (Scale == 0) { + Base = LHS; + Offset = RHS; + return true; + } + + // Check if the RHS is a shift node with a constant. + if (RHS.getOpcode() != ISD::SHL) + return false; + + const SDValue ShiftRHS = RHS.getOperand(1); + if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS)) + if (C->getZExtValue() == Scale) { + Base = LHS; + Offset = RHS.getOperand(0); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d45a80057564a..85db14ab66feb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); -static cl::opt<bool> -EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, - cl::desc("Allow AArch64 SLI/SRI formation"), - cl::init(false)); - // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. @@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; +/// Returns true if VT's elements occupy the lowest bit positions of its +/// associated register class without any intervening space. +/// +/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the +/// same register class, but only nxv8f16 can be treated as a packed vector. 
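/// (Layout sketch per 128-bit granule, lowest lane on the right:
///   nxv8f16: | h7 | h6 | h5 | h4 | h3 | h2 | h1 | h0 |        -- packed
///   nxv4f16: |  .   h3 |  .   h2 |  .   h1 |  .   h0 |        -- unpacked,
/// each element in the low half of a 32-bit container, upper halves unused.)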
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { + assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal vector type!"); + return VT.isFixedLengthVector() || + VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); @@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); + addDRTypeForNEON(MVT::v4bf16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); @@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); + addQRTypeForNEON(MVT::v8bf16); } if (Subtarget->hasSVE()) { @@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + if (Subtarget->hasBF16()) { + addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); + } + + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + } + for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); } for (auto VT : { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + + for (auto VT : + { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, + MVT::nxv2f64 }) { + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } } // Compute derived properties from the register classes @@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const 
TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); @@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); @@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Expand); } + // AArch64 doesn't have i32 MULH{S|U}. + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + // AArch64 doesn't have {U|S}MUL_LOHI. 
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); + setOperationAction(ISD::CTPOP, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); @@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the + // custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. + setOperationAction(ISD::STORE, MVT::v32i8, Custom); + setOperationAction(ISD::STORE, MVT::v16i16, Custom); + setOperationAction(ISD::STORE, MVT::v16f16, Custom); + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v8f32, Custom); + setOperationAction(ISD::STORE, MVT::v4f64, Custom); + setOperationAction(ISD::STORE, MVT::v4i64, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; @@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); + setIndexedLoadAction(im, MVT::bf16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); @@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); + setIndexedStoreAction(im, MVT::bf16, Legal); } // Trap. @@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); + + setOperationAction(ISD::TRUNCATE, VT, Custom); } for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { @@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + if (Subtarget->hasSVE()) + setOperationAction(ISD::VSCALE, MVT::i32, Custom); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } @@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. 
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - if (isTypeLegal(VT)) + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + if (VT.getScalarType() == MVT::i1) + setOperationAction(ISD::SETCC, VT, Custom); + } } + + for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + + for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } + + // NOTE: Currently this has to happen after computeRegisterProperties rather + // than the preferred option of combining it with the addRegisterClass call. + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + + // 64bit results can mean a bigger than NEON input. + for (auto VT : {MVT::v8i8, MVT::v4i16}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); + + // 128bit results imply a bigger than NEON input. + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + for (auto VT : {MVT::v8f16, MVT::v4f32}) + setOperationAction(ISD::FP_ROUND, VT, Expand); + } } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); @@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { } } +void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + // Lower fixed length vector operations to scalable equivalents. 
+ setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); +} + void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); @@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, - EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, + LLVMContext &C, EVT VT) const { if (!VT.isVector()) return MVT::i32; + if (VT.isScalableVector()) + return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, } bool AArch64TargetLowering::targetShrinkDemandedConstant( - SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + TargetLoweringOpt &TLO) const { // Delay this optimization to as late as possible. if (!TLO.LegalOps) return false; @@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( "i32 or i64 is expected after legalization."); // Exit early if we demand all bits. - if (Demanded.countPopulation() == Size) + if (DemandedBits.countPopulation() == Size) return false; unsigned NewOpc; @@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( if (!C) return false; uint64_t Imm = C->getZExtValue(); - return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); + return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in @@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Same as above but handling LLTs instead. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( - LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; @@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. - Align <= 2 || + Alignment <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. 
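One change in the hunk above makes getSetCCResultType return a predicate vector for scalable inputs: comparing two <vscale x N x T> vectors now yields <vscale x N x i1> instead of an integer vector of the element width. A rough standalone sketch of that mapping follows; the type struct and helper are illustrative only and are not the real EVT interface.

#include <cassert>

struct VecType {
  unsigned Elts;     // element count (minimum count if scalable)
  unsigned EltBits;  // element width in bits
  bool Scalable;     // true for <vscale x N x T>
};

// Sketch of the policy: scalable comparisons produce an i1 predicate vector
// with the same element count, while fixed-width vectors keep their element
// width (as integer elements) so the mask stays in the same register class.
VecType setCCResultType(VecType VT) {
  if (VT.Scalable)
    return {VT.Elts, 1, true};          // e.g. nxv4i32 -> nxv4i1
  return {VT.Elts, VT.EltBits, false};  // e.g. v4f32   -> v4i32
}

int main() {
  VecType P = setCCResultType({4, 32, true});
  assert(P.EltBits == 1 && P.Scalable);
  VecType M = setCCResultType({4, 32, false});
  assert(M.EltBits == 32 && !M.Scalable);
}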
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { +#define MAKE_CASE(V) \ + case V: \ + return #V; switch ((AArch64ISD::NodeType)Opcode) { - case AArch64ISD::FIRST_NUMBER: break; - case AArch64ISD::CALL: return "AArch64ISD::CALL"; - case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; - case AArch64ISD::ADR: return "AArch64ISD::ADR"; - case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; - case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; - case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; - case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; - case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; - case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; - case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; - case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; - case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; - case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; - case AArch64ISD::ADC: return "AArch64ISD::ADC"; - case AArch64ISD::SBC: return "AArch64ISD::SBC"; - case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; - case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; - case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; - case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; - case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; - case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; - case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; - case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; - case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::DUP: return "AArch64ISD::DUP"; - case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; - case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; - case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; - case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; - case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; - case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; - case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; - case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; - case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; - case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; - case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; - case AArch64ISD::BICi: return "AArch64ISD::BICi"; - case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; - case AArch64ISD::BSL: return "AArch64ISD::BSL"; - case AArch64ISD::NEG: return "AArch64ISD::NEG"; - case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; - case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; - case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; - case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; - case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; - case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; - case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; - case AArch64ISD::REV16: return "AArch64ISD::REV16"; - case AArch64ISD::REV32: return "AArch64ISD::REV32"; - case AArch64ISD::REV64: return "AArch64ISD::REV64"; - case AArch64ISD::EXT: return "AArch64ISD::EXT"; - case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; - case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; - case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; - case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; - case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; - case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; - case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; - case AArch64ISD::CMHS: return 
"AArch64ISD::CMHS"; - case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; - case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; - case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; - case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; - case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; - case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; - case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; - case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; - case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; - case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; - case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; - case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; - case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; - case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; - case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; - case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; - case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; - case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; - case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; - case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED"; - case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED"; - case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED"; - case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED"; - case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED"; - case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED"; - case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED"; - case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N"; - case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N"; - case AArch64ISD::LASTA: return "AArch64ISD::LASTA"; - case AArch64ISD::LASTB: return "AArch64ISD::LASTB"; - case AArch64ISD::REV: return "AArch64ISD::REV"; - case AArch64ISD::TBL: return "AArch64ISD::TBL"; - case AArch64ISD::NOT: return "AArch64ISD::NOT"; - case AArch64ISD::BIT: return "AArch64ISD::BIT"; - case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; - case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; - case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; - case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; - case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; - case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; - case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; - case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; - case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; - case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; - case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; - case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; - case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; - case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; - case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; - case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; - case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; - case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; - case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; - case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; - case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; - case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; - case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; - case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; - case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; - case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; - case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; - case AArch64ISD::LD1DUPpost: return 
"AArch64ISD::LD1DUPpost"; - case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; - case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; - case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; - case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; - case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; - case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; - case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; - case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; - case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; - case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; - case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; - case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; - case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; - case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; - case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; - case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; - case AArch64ISD::STG: return "AArch64ISD::STG"; - case AArch64ISD::STZG: return "AArch64ISD::STZG"; - case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; - case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; - case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; - case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; - case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; - case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; - case AArch64ISD::INSR: return "AArch64ISD::INSR"; - case AArch64ISD::PTEST: return "AArch64ISD::PTEST"; - case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE"; - case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; - case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; - case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; - case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW"; - case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED"; - case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED"; - case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM"; - case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S"; - case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED"; - case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW"; - case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW"; - case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; - case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; - case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; - case AArch64ISD::SST1: return "AArch64ISD::SST1"; - case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; - case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; - case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; - case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; - case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; - case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; - case AArch64ISD::LDP: return "AArch64ISD::LDP"; - case AArch64ISD::STP: return "AArch64ISD::STP"; - } + case AArch64ISD::FIRST_NUMBER: + break; + MAKE_CASE(AArch64ISD::CALL) + MAKE_CASE(AArch64ISD::ADRP) + MAKE_CASE(AArch64ISD::ADR) + MAKE_CASE(AArch64ISD::ADDlow) + MAKE_CASE(AArch64ISD::LOADgot) + MAKE_CASE(AArch64ISD::RET_FLAG) + MAKE_CASE(AArch64ISD::BRCOND) + MAKE_CASE(AArch64ISD::CSEL) + MAKE_CASE(AArch64ISD::FCSEL) + MAKE_CASE(AArch64ISD::CSINV) + MAKE_CASE(AArch64ISD::CSNEG) + MAKE_CASE(AArch64ISD::CSINC) + MAKE_CASE(AArch64ISD::THREAD_POINTER) + 
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::ADD_PRED) + MAKE_CASE(AArch64ISD::SDIV_PRED) + MAKE_CASE(AArch64ISD::UDIV_PRED) + MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::SHL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRA_MERGE_OP1) + MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ADC) + MAKE_CASE(AArch64ISD::SBC) + MAKE_CASE(AArch64ISD::ADDS) + MAKE_CASE(AArch64ISD::SUBS) + MAKE_CASE(AArch64ISD::ADCS) + MAKE_CASE(AArch64ISD::SBCS) + MAKE_CASE(AArch64ISD::ANDS) + MAKE_CASE(AArch64ISD::CCMP) + MAKE_CASE(AArch64ISD::CCMN) + MAKE_CASE(AArch64ISD::FCCMP) + MAKE_CASE(AArch64ISD::FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::DUP) + MAKE_CASE(AArch64ISD::DUPLANE8) + MAKE_CASE(AArch64ISD::DUPLANE16) + MAKE_CASE(AArch64ISD::DUPLANE32) + MAKE_CASE(AArch64ISD::DUPLANE64) + MAKE_CASE(AArch64ISD::MOVI) + MAKE_CASE(AArch64ISD::MOVIshift) + MAKE_CASE(AArch64ISD::MOVIedit) + MAKE_CASE(AArch64ISD::MOVImsl) + MAKE_CASE(AArch64ISD::FMOV) + MAKE_CASE(AArch64ISD::MVNIshift) + MAKE_CASE(AArch64ISD::MVNImsl) + MAKE_CASE(AArch64ISD::BICi) + MAKE_CASE(AArch64ISD::ORRi) + MAKE_CASE(AArch64ISD::BSP) + MAKE_CASE(AArch64ISD::NEG) + MAKE_CASE(AArch64ISD::EXTR) + MAKE_CASE(AArch64ISD::ZIP1) + MAKE_CASE(AArch64ISD::ZIP2) + MAKE_CASE(AArch64ISD::UZP1) + MAKE_CASE(AArch64ISD::UZP2) + MAKE_CASE(AArch64ISD::TRN1) + MAKE_CASE(AArch64ISD::TRN2) + MAKE_CASE(AArch64ISD::REV16) + MAKE_CASE(AArch64ISD::REV32) + MAKE_CASE(AArch64ISD::REV64) + MAKE_CASE(AArch64ISD::EXT) + MAKE_CASE(AArch64ISD::VSHL) + MAKE_CASE(AArch64ISD::VLSHR) + MAKE_CASE(AArch64ISD::VASHR) + MAKE_CASE(AArch64ISD::VSLI) + MAKE_CASE(AArch64ISD::VSRI) + MAKE_CASE(AArch64ISD::CMEQ) + MAKE_CASE(AArch64ISD::CMGE) + MAKE_CASE(AArch64ISD::CMGT) + MAKE_CASE(AArch64ISD::CMHI) + MAKE_CASE(AArch64ISD::CMHS) + MAKE_CASE(AArch64ISD::FCMEQ) + MAKE_CASE(AArch64ISD::FCMGE) + MAKE_CASE(AArch64ISD::FCMGT) + MAKE_CASE(AArch64ISD::CMEQz) + MAKE_CASE(AArch64ISD::CMGEz) + MAKE_CASE(AArch64ISD::CMGTz) + MAKE_CASE(AArch64ISD::CMLEz) + MAKE_CASE(AArch64ISD::CMLTz) + MAKE_CASE(AArch64ISD::FCMEQz) + MAKE_CASE(AArch64ISD::FCMGEz) + MAKE_CASE(AArch64ISD::FCMGTz) + MAKE_CASE(AArch64ISD::FCMLEz) + MAKE_CASE(AArch64ISD::FCMLTz) + MAKE_CASE(AArch64ISD::SADDV) + MAKE_CASE(AArch64ISD::UADDV) + MAKE_CASE(AArch64ISD::SRHADD) + MAKE_CASE(AArch64ISD::URHADD) + MAKE_CASE(AArch64ISD::SMINV) + MAKE_CASE(AArch64ISD::UMINV) + MAKE_CASE(AArch64ISD::SMAXV) + MAKE_CASE(AArch64ISD::UMAXV) + MAKE_CASE(AArch64ISD::SMAXV_PRED) + MAKE_CASE(AArch64ISD::UMAXV_PRED) + MAKE_CASE(AArch64ISD::SMINV_PRED) + MAKE_CASE(AArch64ISD::UMINV_PRED) + MAKE_CASE(AArch64ISD::ORV_PRED) + MAKE_CASE(AArch64ISD::EORV_PRED) + MAKE_CASE(AArch64ISD::ANDV_PRED) + MAKE_CASE(AArch64ISD::CLASTA_N) + MAKE_CASE(AArch64ISD::CLASTB_N) + MAKE_CASE(AArch64ISD::LASTA) + MAKE_CASE(AArch64ISD::LASTB) + MAKE_CASE(AArch64ISD::REV) + MAKE_CASE(AArch64ISD::REINTERPRET_CAST) + MAKE_CASE(AArch64ISD::TBL) + MAKE_CASE(AArch64ISD::FADD_PRED) + MAKE_CASE(AArch64ISD::FADDA_PRED) + MAKE_CASE(AArch64ISD::FADDV_PRED) + MAKE_CASE(AArch64ISD::FMA_PRED) + MAKE_CASE(AArch64ISD::FMAXV_PRED) + MAKE_CASE(AArch64ISD::FMAXNMV_PRED) + MAKE_CASE(AArch64ISD::FMINV_PRED) + MAKE_CASE(AArch64ISD::FMINNMV_PRED) + MAKE_CASE(AArch64ISD::NOT) + MAKE_CASE(AArch64ISD::BIT) + MAKE_CASE(AArch64ISD::CBZ) + 
MAKE_CASE(AArch64ISD::CBNZ) + MAKE_CASE(AArch64ISD::TBZ) + MAKE_CASE(AArch64ISD::TBNZ) + MAKE_CASE(AArch64ISD::TC_RETURN) + MAKE_CASE(AArch64ISD::PREFETCH) + MAKE_CASE(AArch64ISD::SITOF) + MAKE_CASE(AArch64ISD::UITOF) + MAKE_CASE(AArch64ISD::NVCAST) + MAKE_CASE(AArch64ISD::SQSHL_I) + MAKE_CASE(AArch64ISD::UQSHL_I) + MAKE_CASE(AArch64ISD::SRSHR_I) + MAKE_CASE(AArch64ISD::URSHR_I) + MAKE_CASE(AArch64ISD::SQSHLU_I) + MAKE_CASE(AArch64ISD::WrapperLarge) + MAKE_CASE(AArch64ISD::LD2post) + MAKE_CASE(AArch64ISD::LD3post) + MAKE_CASE(AArch64ISD::LD4post) + MAKE_CASE(AArch64ISD::ST2post) + MAKE_CASE(AArch64ISD::ST3post) + MAKE_CASE(AArch64ISD::ST4post) + MAKE_CASE(AArch64ISD::LD1x2post) + MAKE_CASE(AArch64ISD::LD1x3post) + MAKE_CASE(AArch64ISD::LD1x4post) + MAKE_CASE(AArch64ISD::ST1x2post) + MAKE_CASE(AArch64ISD::ST1x3post) + MAKE_CASE(AArch64ISD::ST1x4post) + MAKE_CASE(AArch64ISD::LD1DUPpost) + MAKE_CASE(AArch64ISD::LD2DUPpost) + MAKE_CASE(AArch64ISD::LD3DUPpost) + MAKE_CASE(AArch64ISD::LD4DUPpost) + MAKE_CASE(AArch64ISD::LD1LANEpost) + MAKE_CASE(AArch64ISD::LD2LANEpost) + MAKE_CASE(AArch64ISD::LD3LANEpost) + MAKE_CASE(AArch64ISD::LD4LANEpost) + MAKE_CASE(AArch64ISD::ST2LANEpost) + MAKE_CASE(AArch64ISD::ST3LANEpost) + MAKE_CASE(AArch64ISD::ST4LANEpost) + MAKE_CASE(AArch64ISD::SMULL) + MAKE_CASE(AArch64ISD::UMULL) + MAKE_CASE(AArch64ISD::FRECPE) + MAKE_CASE(AArch64ISD::FRECPS) + MAKE_CASE(AArch64ISD::FRSQRTE) + MAKE_CASE(AArch64ISD::FRSQRTS) + MAKE_CASE(AArch64ISD::STG) + MAKE_CASE(AArch64ISD::STZG) + MAKE_CASE(AArch64ISD::ST2G) + MAKE_CASE(AArch64ISD::STZ2G) + MAKE_CASE(AArch64ISD::SUNPKHI) + MAKE_CASE(AArch64ISD::SUNPKLO) + MAKE_CASE(AArch64ISD::UUNPKHI) + MAKE_CASE(AArch64ISD::UUNPKLO) + MAKE_CASE(AArch64ISD::INSR) + MAKE_CASE(AArch64ISD::PTEST) + MAKE_CASE(AArch64ISD::PTRUE) + MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) + 
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ST1_PRED) + MAKE_CASE(AArch64ISD::SST1_PRED) + MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_IMM_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) + MAKE_CASE(AArch64ISD::LDP) + MAKE_CASE(AArch64ISD::STP) + MAKE_CASE(AArch64ISD::STNP) + MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::INDEX_VECTOR) + } +#undef MAKE_CASE return nullptr; } @@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } -MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( - MachineInstr &MI, MachineBasicBlock *BB) const { - MI.eraseFromParent(); - return BB; -} - MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); - case AArch64::CATCHPAD: - return EmitLoweredCatchPad(MI, BB); } } @@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) { (CC == ISD::SETEQ || CC == ISD::SETNE); } +static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, + SelectionDAG &DAG, SDValue Chain, + bool IsSignaling) { + EVT VT = LHS.getValueType(); + assert(VT != MVT::f128); + assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); + unsigned Opcode = + IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; + return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); +} + static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); @@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; LHS = LHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = AArch64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); + } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { + if (LHS.getOpcode() == ISD::AND) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. 
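As a concrete illustration of why only the signed conditions are safe here: ANDS sets NZCV from the value of (X & Y) itself, so EQ/NE and MI/PL can be read straight off the N and Z flags, while the unsigned orderings would need the C flag of a real subtraction. The sketch below is standalone and illustrative, not compiler code; flag names follow the usual AArch64 convention.

#include <cassert>
#include <cstdint>

// N and Z as ANDS would set them, derived purely from the AND result.
struct NZ {
  bool N; // result is negative (sign bit set)
  bool Z; // result is zero
};

NZ andsFlags(int64_t X, int64_t Y) {
  int64_t R = X & Y;
  return {R < 0, R == 0};
}

int main() {
  // (X & Y) == 0 corresponds to Z set (condition EQ); != 0 to NE.
  assert(andsFlags(0b1010, 0b0101).Z);
  assert(!andsFlags(0b1010, 0b0110).Z);

  // (X & Y) < 0 (signed) corresponds to N set (condition MI); >= 0 to PL.
  assert(andsFlags(INT64_MIN, -1).N);
  assert(!andsFlags(42, 7).N);

  // Unsigned orderings (HI, LO, ...) depend on the carry flag, which ANDS
  // does not produce from the AND result, hence the signed-only caveat.
  return 0;
}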
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, + DAG.getVTList(VT, MVT_CC), + LHS.getOperand(0), + LHS.getOperand(1)); + // Replace all users of (and X, Y) with newly generated (ands X, Y) + DAG.ReplaceAllUsesWith(LHS, ANDSNode); + return ANDSNode.getValue(1); + } else if (LHS.getOpcode() == AArch64ISD::ANDS) { + // Use result of ANDS + return LHS.getValue(1); + } } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) @@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end()); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; -} - -// Returns true if the given Op is the overflow flag result of an overflow -// intrinsic operation. -static bool isOverflowIntrOpRes(SDValue Op) { - unsigned Opc = Op.getOpcode(); - return (Op.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, + CallOptions, dl, Chain); + return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. - if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { + if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); @@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = SrcVal.getValueType(); + + if (SrcVT != MVT::f128) { + // Expand cases where the input is a vector bigger than NEON. + if (useSVEForFixedLengthVectorVT(SrcVT)) + return SDValue(); + // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. - SDValue SrcVal = Op.getOperand(0); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, - SDLoc(Op)).first; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, + CallOptions, dl, Chain); + return IsStrict ? 
DAG.getMergeValues({Result, Chain}, dl) : Result; } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + + if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. - if (Op.getOperand(0).getValueType() == MVT::f16 && - !Subtarget->hasFullFP16()) { + if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), - DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); } - if (Op.getOperand(0).getValueType() != MVT::f128) { + if (SrcVal.getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; + return LowerF128Call(Op, DAG, LC); } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, - DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), + DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. 
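The LowerFLT_ROUNDS_ change just below reads FPCR through a chained intrinsic and converts the RMode field (bits 23:22) into the C FLT_ROUNDS encoding with an add-then-shift. The following standalone sketch checks that bit arithmetic, assuming the usual encodings (FPCR RMode: 0 nearest, 1 toward +inf, 2 toward -inf, 3 toward zero; FLT_ROUNDS: 1, 2, 3, 0 respectively).

#include <cassert>
#include <cstdint>

// Convert FPCR.RMode (bits 23:22) to the FLT_ROUNDS encoding. Adding 1 << 22
// increments the two-bit field modulo 4, which is exactly the rotation
// needed (0->1, 1->2, 2->3, 3->0); any carry into bit 24 is discarded by the
// final "& 3".
unsigned fltRoundsFromFPCR(uint64_t FPCR) {
  uint32_t FPCR32 = static_cast<uint32_t>(FPCR);
  return ((FPCR32 + (1u << 22)) >> 22) & 3;
}

int main() {
  assert(fltRoundsFromFPCR(0u << 22) == 1); // round to nearest
  assert(fltRoundsFromFPCR(1u << 22) == 2); // round toward +infinity
  assert(fltRoundsFromFPCR(2u << 22) == 3); // round toward -infinity
  assert(fltRoundsFromFPCR(3u << 22) == 0); // round toward zero
}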
- if (Op.getOperand(0).getValueType() == MVT::i128) + if (SrcVal.getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based @@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return Op; RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP) + LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } @@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() != MVT::f16) + EVT OpVT = Op.getValueType(); + if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); @@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( - DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } @@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, - MVT::i64)); + SDValue Chain = Op.getOperand(0); + SDValue FPCR_64 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, + {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); + Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); + SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); + return DAG.getMergeValues({AND, Chain}, dl); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { @@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, + int Pattern) { + return DAG.getNode(AArch64ISD::PTRUE, DL, VT, + DAG.getTargetConstant(Pattern, DL, MVT::i32)); +} + SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_ptrue: return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_dupq_lane: + return LowerDUPQLane(Op, DAG); + case Intrinsic::aarch64_sve_convert_from_svbool: + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_convert_to_svbool: { 
+ EVT OutVT = Op.getValueType(); + EVT InVT = Op.getOperand(1).getValueType(); + // Return the operand if the cast isn't changing type, + // i.e. <n x 16 x i1> -> <n x 16 x i1> + if (InVT == OutVT) + return Op.getOperand(1); + // Otherwise, zero the newly introduced lanes. + SDValue Reinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1)); + SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all); + SDValue MaskReinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask); + return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret); + } case Intrinsic::aarch64_sve_insr: { SDValue Scalar = Op.getOperand(2); @@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } + + case Intrinsic::aarch64_neon_vsri: + case Intrinsic::aarch64_neon_vsli: { + EVT Ty = Op.getValueType(); + + if (!Ty.isVector()) + report_fatal_error("Unexpected type for aarch64_neon_vsli"); + + assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); + + bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; + unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3)); + } + + case Intrinsic::aarch64_neon_srhadd: + case Intrinsic::aarch64_neon_urhadd: { + bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd; + unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } } } @@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT MemVT = StoreNode->getMemoryVT(); if (VT.isVector()) { + if (useSVEForFixedLengthVectorVT(VT)) + return LowerFixedLengthVectorStoreToSVE(Op, DAG); + unsigned AS = StoreNode->getAddressSpace(); - unsigned Align = StoreNode->getAlignment(); - if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, + Align Alignment = StoreNode->getAlign(); + if (Alignment < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), StoreNode->getMemOperand()->getFlags(), nullptr)) { return scalarizeVectorStore(StoreNode, DAG); @@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, if (StoreNode->isTruncatingStore()) { return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); } + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of + // the custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. 
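To illustrate the split described in the comment above before the DAG code that performs it: since there is no unpaired non-temporal store, a 256-bit value is sliced into two 128-bit halves and emitted as one paired STNP. The sketch below shows only the halving of lanes on a plain array; the 8 x i32 shape is one of the cases registered earlier and is used here purely as an example.

#include <array>
#include <cassert>
#include <cstdint>

// Split a 256-bit vector (here modelled as 8 x i32) into the two 128-bit
// halves a paired non-temporal store expects: Lo = lanes [0, N/2),
// Hi = lanes [N/2, N).
struct Halves {
  std::array<uint32_t, 4> Lo;
  std::array<uint32_t, 4> Hi;
};

Halves splitForSTNP(const std::array<uint32_t, 8> &V) {
  Halves H{};
  for (unsigned i = 0; i < 4; ++i) {
    H.Lo[i] = V[i];
    H.Hi[i] = V[i + 4];
  }
  return H;
}

int main() {
  std::array<uint32_t, 8> V{0, 1, 2, 3, 4, 5, 6, 7};
  Halves H = splitForSTNP(V);
  assert(H.Lo[0] == 0 && H.Lo[3] == 3);
  assert(H.Hi[0] == 4 && H.Hi[3] == 7);
}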
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && + MemVT.getVectorElementCount().Min % 2u == 0 && + ((MemVT.getScalarSizeInBits() == 8u || + MemVT.getScalarSizeInBits() == 16u || + MemVT.getScalarSizeInBits() == 32u || + MemVT.getScalarSizeInBits() == 64u))) { + SDValue Lo = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), + {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + StoreNode->getMemoryVT(), StoreNode->getMemOperand()); + return Result; + } } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { assert(StoreNode->getValue()->getValueType(0) == MVT::i128); SDValue Lo = @@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); @@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FMA: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::INSERT_SUBVECTOR: + return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::SDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED); + case ISD::UDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED); + case ISD::SMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1); + case ISD::UMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1); + case ISD::SMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1); + case ISD::UMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1); case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -3218,9 +3571,68 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::VSCALE: + return LowerVSCALE(Op, DAG); + case ISD::TRUNCATE: + return 
LowerTRUNCATE(Op, DAG); + case ISD::LOAD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerFixedLengthVectorLoadToSVE(Op, DAG); + llvm_unreachable("Unexpected request to lower ISD::LOAD"); + case ISD::ADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); + llvm_unreachable("Unexpected request to lower ISD::ADD"); } } +bool AArch64TargetLowering::useSVEForFixedLengthVectors() const { + // Prefer NEON unless larger SVE registers are available. + return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256; +} + +bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const { + if (!useSVEForFixedLengthVectors()) + return false; + + if (!VT.isFixedLengthVector()) + return false; + + // Fixed length predicates should be promoted to i8. + // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. + if (VT.getVectorElementType() == MVT::i1) + return false; + + // Don't use SVE for vectors we cannot scalarize if required. + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f16: + case MVT::f32: + case MVT::f64: + break; + } + + // Ensure NEON MVTs only belong to a single register class. + if (VT.getSizeInBits() <= 128) + return false; + + // Don't use SVE for types that don't fit. + if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) + return false; + + // TODO: Perhaps an artificial restriction, but worth having whilst getting + // the base fixed length SVE support in place. + if (!VT.isPow2VectorType()) + return false; + + return true; +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { default: report_fatal_error("Unsupported calling convention."); - case CallingConv::AArch64_SVE_VectorCall: - // Calling SVE functions is currently not yet supported. 
- report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: @@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::CFGuard_Check: return CC_AArch64_Win64_CFGuard_Check; case CallingConv::AArch64_VectorCall: + case CallingConv::AArch64_SVE_VectorCall: return CC_AArch64_AAPCS; } } @@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; - else if (RegVT == MVT::f16) + else if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; @@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; @@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; + unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect + ? VA.getLocVT().getSizeInBits() + : VA.getValVT().getSizeInBits()) / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && @@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MemVT = VA.getLocVT(); + break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MemVT); } + + if (VA.getLocInfo() == CCValAssign::Indirect) { + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + // If value is passed via pointer - do a load. + ArgValue = + DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo()); + } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), ArgValue, DAG.getValueType(MVT::i32)); @@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, // The extra size here, if triggered, will always be 8. 
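The claim in the comment above, that the extra size is always 8 when it is triggered, follows from the save-area arithmetic: the GPR save size is 8 bytes per remaining argument register, so it is always a multiple of 8, its remainder modulo 16 is either 0 or 8, and the padding 16 - (GPRSaveSize & 15) is only created in the latter case. A quick standalone check of that reasoning; the register count of 8 (x0-x7) is the usual AAPCS value and is used here only as a sample input.

#include <cassert>

// GPR save area for varargs: 8 bytes per remaining argument register.
unsigned gprSaveSize(unsigned NumGPRArgRegs, unsigned FirstVariadicGPR) {
  return 8 * (NumGPRArgRegs - FirstVariadicGPR);
}

int main() {
  // Every possible save size is a multiple of 8, so the 16-byte alignment
  // padding is either 0 (no object created) or exactly 8.
  for (unsigned FirstVariadicGPR = 0; FirstVariadicGPR <= 8;
       ++FirstVariadicGPR) {
    unsigned Size = gprSaveSize(8, FirstVariadicGPR);
    if (Size & 15) {
      unsigned Pad = 16 - (Size & 15);
      assert(Pad == 8); // the "extra size" is always 8 when it is needed
    }
  }
}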
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { - FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); + FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); @@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + // When using the Windows calling convention on a non-windows OS, we want + // to back up and restore X18 in such functions; we can't do a tail call + // from those functions. + if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && + CalleeCC != CallingConv::Win64) + return false; + // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. @@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + // If any of the arguments is passed indirectly, it must be SVE, so the + // 'getBytesInStackArgArea' is not sufficient to determine whether we need to + // allocate space on the stack. That is why we determine this explicitly here + // the call cannot be a tailcall. + if (llvm::any_of(ArgLocs, [](CCValAssign &A) { + assert((A.getLocInfo() != CCValAssign::Indirect || + A.getValVT().isScalableVector()) && + "Expected value to be scalable"); + return A.getLocInfo() == CCValAssign::Indirect; + })) + return false; + // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) @@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) { + if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); @@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); + Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); + int FI = MFI.CreateStackObject( + VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false); + MFI.setStackID(FI, TargetStackID::SVEVector); + + SDValue SpillSlot = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getStore( + Chain, DL, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + Arg = SpillSlot; + break; } if (VA.isRegLoc()) { @@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); } } else { @@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getValVT().getSizeInBits(); + unsigned OpSize; + if (VA.getLocInfo() == CCValAssign::Indirect) + OpSize = VA.getLocVT().getSizeInBits(); + else + OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { @@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + Chain, DL, DstAddr, Arg, SizeNode, + Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, - /*isTailCall = */ false, - DstInfo, MachinePointerInfo()); + /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. 
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { - return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), N->getOffset(), Flag); } @@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. - if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && + if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) @@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Cmp); } - assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || - LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || + LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. @@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; + } else if (VT == MVT::i128) { + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || @@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Chain; + if (IsStrict) + Chain = Op.getOperand(0); + SDValue LHS = Op.getOperand(OpNo + 0); + SDValue RHS = Op.getOperand(OpNo + 1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. @@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, + IsSignaling); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); - return LHS; + return IsStrict ? 
DAG.getMergeValues({LHS, Chain}, dl) : LHS; } } @@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } // Now we know we're dealing with FP values. @@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + SDValue Cmp; + if (IsStrict) + Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); + else + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue Res; if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); @@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in @@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); - return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } + return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, @@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue FVal = Op->getOperand(2); SDLoc DL(Op); + EVT Ty = Op.getValueType(); + if (Ty.isScalableVector()) { + SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); + MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); + SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); + return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); + } + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. - if (isOverflowIntrOpRes(CCVal)) { + if (ISD::isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. 
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); @@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, - false, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); + DAG.getConstant(VaListSize, DL, MVT::i32), + Align(PtrSize), false, false, false, + MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); + MaybeAlign Align(Op.getConstantOperandVal(3)); unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); @@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain = VAList.getValue(1); VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > MinSlotSize) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Align - 1, DL, PtrVT)); + DAG.getConstant(Align->value() - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-(int64_t)Align, DL, PtrVT)); + DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || - VT.getVectorElementType() == MVT::f16) + VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); @@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; - else if (EltTy == MVT::i16 || EltTy == MVT::f16) + else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; @@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; - if (EltType == MVT::i16 || EltType == MVT::f16) + if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; @@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, // Extend input splat value where needed to fit into a GPR (32b or 64b only) // FPRs don't have this restriction. 
switch (ElemVT.getSimpleVT().SimpleTy) { - case MVT::i8: - case MVT::i16: - case MVT::i32: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); - case MVT::i64: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); case MVT::i1: { + // The only legal i1 vectors are SVE vectors, so we can use SVE-specific + // lowering code. + if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { + if (ConstVal->isOne()) + return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); + // TODO: Add special case for constant false + } // The general case of i1. There isn't any natural way to do this, // so we use some trickery with whilelo. - // TODO: Add special cases for splat of constant true/false. SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, DAG.getValueType(MVT::i1)); @@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, DAG.getConstant(0, dl, MVT::i64), SplatVal); } - // TODO: we can support float types, but haven't added patterns yet. + case MVT::i8: + case MVT::i16: + case MVT::i32: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; case MVT::f16: + case MVT::bf16: case MVT::f32: case MVT::f64: + // Fine as is + break; default: report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + +SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + if (!isTypeLegal(VT) || !VT.isScalableVector()) + return SDValue(); + + // Current lowering only supports the SVE-ACLE types. + if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) + return SDValue(); + + // The DUPQ operation is indepedent of element type so normalise to i64s. + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); + SDValue Idx128 = Op.getOperand(2); + + // DUPQ can be used when idx is in range. + auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); + if (CIdx && (CIdx->getZExtValue() <= 3)) { + SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); + SDNode *DUPQ = + DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); + return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); + } + + // The ACLE says this must produce the same result as: + // svtbl(data, svadd_x(svptrue_b64(), + // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), + // index * 2)) + SDValue One = DAG.getConstant(1, DL, MVT::i64); + SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); + + // create the vector 0,1,0,1,... + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR, + DL, MVT::nxv2i64, Zero, One); + SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); + + // create the vector idx64,idx64+1,idx64,idx64+1,... + SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); + SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); + SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); + + // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... 
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); + return DAG.getNode(ISD::BITCAST, DL, VT, TBL); } + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) { // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. +// BUILD_VECTORs with constant element C1, C2 is a constant, and: +// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) +// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) +// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) + SDValue And; + SDValue Shift; + + SDValue FirstOp = N->getOperand(0); + unsigned FirstOpc = FirstOp.getOpcode(); + SDValue SecondOp = N->getOperand(1); + unsigned SecondOpc = SecondOp.getOpcode(); + + // Is one of the operands an AND or a BICi? The AND may have been optimised to + // a BICi in order to use an immediate instead of a register. + // Is the other operand an shl or lshr? This will have been turned into: + // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. + if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && + (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { + And = FirstOp; + Shift = SecondOp; + + } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && + (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { + And = SecondOp; + Shift = FirstOp; + } else return SDValue(); - // Is the second op an shl or lshr? - SDValue Shift = N->getOperand(1); - // This will have been turned into: AArch64ISD::VSHL vector, #shift - // or AArch64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + bool IsAnd = And.getOpcode() == ISD::AND; + bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); if (!C2node) return SDValue(); - // Is the and mask vector all constant? uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); + if (IsAnd) { + // Is the and mask vector all constant? + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + } else { + // Reconstruct the corresponding AND immediate from the two BICi immediates. + ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); + ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); + assert(C1nodeImm && C1nodeShift); + C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); + } - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? + // Is C1 == ~(Ones(ElemSizeInBits) << C2) or + // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account + // how much one can shift elements of a particular size? 
uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) + + APInt C1AsAPInt(ElemSizeInBits, C1); + APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) + : APInt::getLowBitsSet(ElemSizeInBits, C2); + if (C1AsAPInt != RequiredC1) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); - unsigned Intrin = - IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, DL, MVT::i32), X, Y, - Shift.getOperand(1)); + unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); @@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableAArch64SlrGeneration) { - if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) - return Res; - } + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) + return Res; EVT VT = Op.getValueType(); @@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; EVT EltTy = VT.getVectorElementType(); - assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && - "Unsupported floating-point vector type"); + assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || + EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); @@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform insertion by expanding the value @@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform extraction by expanding the value @@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... 
- if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); + assert(Op.getValueType().isFixedLengthVector() && + "Only cases that extract a fixed length vector are supported!"); + EVT InVT = Op.getOperand(0).getValueType(); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); + if (InVT.isScalableVector()) { + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG)) + return Op; + + return SDValue(); + } + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. - if (Val == 0) + if (Idx == 0 && InVT.getSizeInBits() <= 128) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64) + return Op; + + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + "Only expect to lower inserts into scalable vectors!"); + + EVT InVT = Op.getOperand(1).getValueType(); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + + // We don't have any patterns for scalable vector yet. + if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT)) + return SDValue(); + + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { + // Currently no fixed length shuffles that require SVE are legal. + if (useSVEForFixedLengthVectorVT(VT)) + return false; + if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; @@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } +// Attempt to form urhadd(OpA, OpB) from +// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)). +// The original form of this expression is +// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function +// is called the srl will have been lowered to AArch64ISD::VLSHR and the +// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)). +// This pass can also recognize a variant of this pattern that uses sign +// extension instead of zero extension and form a srhadd(OpA, OpB) from it. +SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (!VT.isVector() || VT.isScalableVector()) + return Op; + + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + return LowerFixedLengthVectorTruncateToSVE(Op, DAG); + + // Since we are looking for a right shift by a constant value of 1 and we are + // operating on types at least 16 bits in length (sign/zero extended OpA and + // OpB, which are at least 8 bits), it follows that the truncate will always + // discard the shifted-in bit and therefore the right shift will be logical + // regardless of the signedness of OpA and OpB. + SDValue Shift = Op.getOperand(0); + if (Shift.getOpcode() != AArch64ISD::VLSHR) + return Op; + + // Is the right shift using an immediate value of 1? 
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1); + if (ShiftAmount != 1) + return Op; + + SDValue Sub = Shift->getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return Op; + + SDValue Xor = Sub.getOperand(1); + if (Xor.getOpcode() != ISD::XOR) + return Op; + + SDValue ExtendOpA = Xor.getOperand(0); + SDValue ExtendOpB = Sub.getOperand(0); + unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); + unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); + if (!(ExtendOpAOpc == ExtendOpBOpc && + (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + return Op; + + // Is the result of the right shift being truncated to the same value type as + // the original operands, OpA and OpB? + SDValue OpA = ExtendOpA.getOperand(0); + SDValue OpB = ExtendOpB.getOperand(0); + EVT OpAVT = OpA.getValueType(); + assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); + if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + return Op; + + // Is the XOR using a constant amount of all ones in the right hand side? + uint64_t C; + if (!isAllConstantBuildVector(Xor.getOperand(1), C)) + return Op; + + unsigned ElemSizeInBits = VT.getScalarSizeInBits(); + APInt CAsAPInt(ElemSizeInBits, C); + if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) + return Op; + + SDLoc DL(Op); + bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; + unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB); + + return ResultURHADD; +} + SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, llvm_unreachable("unexpected shift opcode"); case ISD::SHL: + if (VT.isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1); + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); @@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: + if (VT.isScalableVector()) { + unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_MERGE_OP1 + : AArch64ISD::SRL_MERGE_OP1; + return LowerToPredicatedOp(Op, DAG, Opc); + } + // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = @@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) { + if (Op.getOperand(0).getValueType().isFloatingPoint()) + return Op; + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); + } + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( @@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); @@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), @@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, dl); } +SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT != MVT::i64 && "Expected illegal VSCALE node"); + + SDLoc DL(Op); + APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); + return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), + DL, VT); +} + +/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. +template <unsigned NumVecs> +static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, + const CallInst &CI) { + Info.opc = ISD::INTRINSIC_VOID; + // Retrieve EC from first vector argument. + const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType()); + ElementCount EC = VT.getVectorElementCount(); +#ifndef NDEBUG + // Check the assumption that all input vectors are the same type. + for (unsigned I = 0; I < NumVecs; ++I) + assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) && + "Invalid type."); +#endif + // memVT is `NumVecs * VT`. + Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), + EC * NumVecs); + Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1); + Info.offset = 0; + Info.align.reset(); + Info.flags = MachineMemOperand::MOStore; + return true; +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { + case Intrinsic::aarch64_sve_st2: + return setInfoSVEStN<2>(Info, I); + case Intrinsic::aarch64_sve_st3: + return setInfoSVEStN<3>(Info, I); + case Intrinsic::aarch64_sve_st4: + return setInfoSVEStN<4>(Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_sve_ldnt1: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOLoad; + if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOStore; + if (Intrinsic == Intrinsic::aarch64_sve_stnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } default: @@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { /// or upper half of the vector elements. 
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { auto areTypesHalfed = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast<VectorType>(FullV->getType()); - auto *HalfVT = cast<VectorType>(HalfV->getType()); - return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); + auto *FullTy = FullV->getType(); + auto *HalfTy = HalfV->getType(); + return FullTy->getPrimitiveSizeInBits().getFixedSize() == + 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); }; auto extractHalf = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast<VectorType>(FullV->getType()); - auto *HalfVT = cast<VectorType>(HalfV->getType()); + auto *FullVT = cast<FixedVectorType>(FullV->getType()); + auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); }; - Constant *M1, *M2; + ArrayRef<int> M1, M2; Value *S1Op1, *S2Op1; - if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || - !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) + if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || + !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) return false; // Check that the operands are half as wide as the result and we extract @@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { // elements. int M1Start = -1; int M2Start = -1; - int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2; + int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) @@ -8948,6 +9682,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { return true; } +/// Check if Op could be used with vmull_high_p64 intrinsic. +static bool isOperandOfVmullHighP64(Value *Op) { + Value *VectorOperand = nullptr; + ConstantInt *ElementIndex = nullptr; + return match(Op, m_ExtractElt(m_Value(VectorOperand), + m_ConstantInt(ElementIndex))) && + ElementIndex->getValue() == 1 && + isa<FixedVectorType>(VectorOperand->getType()) && + cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; +} + +/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. +static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { + return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); +} + /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). @@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands( Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; + + case Intrinsic::aarch64_neon_pmull64: + if (!areOperandsOfVmullHighP64(II->getArgOperand(0), + II->getArgOperand(1))) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + Ops.push_back(&II->getArgOperandUse(1)); + return true; + default: return false; } @@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands( } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { + Align &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. 
- RequiredAligment = 0; + RequiredAligment = Align(1); unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } @@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } MachineMemOperand::Flags -AArch64TargetLowering::getMMOFlags(const Instruction &I) const { +AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; @@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType( unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the number of vector elements is greater than 1. - if (VecTy->getNumElements() < 2) + if (cast<FixedVectorType>(VecTy)->getNumElements() < 2) return false; // Ensure the element type is legal. @@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); - VectorType *VecTy = Shuffles[0]->getType(); + VectorType *VTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL)) return false; - unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + unsigned NumLoads = getNumInterleavedAccesses(VTy, DL); + + auto *FVTy = cast<FixedVectorType>(VTy); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. - Type *EltTy = VecTy->getVectorElementType(); + Type *EltTy = FVTy->getElementType(); if (EltTy->isPointerTy()) - VecTy = - VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + FVTy = + FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); IRBuilder<> Builder(LI); @@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / NumLoads); + FVTy = FixedVectorType::get(FVTy->getElementType(), + FVTy->getNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace())); + BaseAddr, + FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } - Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); - Type *Tys[2] = {VecTy, PtrTy}; + Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {FVTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; @@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. 
if (LoadCount > 0) - BaseAddr = - Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, - VecTy->getVectorNumElements() * Factor); + BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr, + FVTy->getNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); @@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, VectorType::get(SVI->getType()->getVectorElementType(), - VecTy->getVectorNumElements())); + SubVec, FixedVectorType::get(SVI->getType()->getElementType(), + FVTy->getNumElements())); SubVecs[SVI].push_back(SubVec); } } @@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - VectorType *VecTy = SVI->getType(); - assert(VecTy->getVectorNumElements() % Factor == 0 && - "Invalid interleaved store"); + auto *VecTy = cast<FixedVectorType>(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned LaneLen = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); - unsigned NumOpElts = Op0->getType()->getVectorNumElements(); + unsigned NumOpElts = + cast<FixedVectorType>(Op0->getType())->getNumElements(); // Convert to the corresponding integer vector. - Type *IntVecTy = VectorType::get(IntTy, NumOpElts); + auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, LaneLen); + SubVecTy = FixedVectorType::get(IntTy, LaneLen); } // The base address of the store. @@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. 
BaseAddr = Builder.CreateBitCast( - BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( - SI->getPointerAddressSpace())); + BaseAddr, + SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); @@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); } } // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); @@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); +// Lower an SVE structured load intrinsic returning a tuple type to target +// specific intrinsic taking the same input but returning a multi-result value +// of the split tuple type. +// +// E.g. Lowering an LD3: +// +// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32( +// <vscale x 4 x i1> %pred, +// <vscale x 4 x i32>* %addr) +// +// Output DAG: +// +// t0: ch = EntryToken +// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0 +// t4: i64,ch = CopyFromReg t0, Register:i64 %1 +// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4 +// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2 +// +// This is called pre-legalization to avoid widening/splitting issues with +// non-power-of-2 tuple types used for LD3, such as nxv12i32. 
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, + ArrayRef<SDValue> LoadOps, + EVT VT, SelectionDAG &DAG, + const SDLoc &DL) const { + assert(VT.isScalableVector() && "Can only lower scalable vectors"); + + unsigned N, Opcode; + static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = { + {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; + + std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; + assert(VT.getVectorElementCount().Min % N == 0 && + "invalid tuple vector type!"); + + EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorElementCount() / N); + assert(isTypeLegal(SplitVT)); + + SmallVector<EVT, 5> VTs(N, SplitVT); + VTs.push_back(MVT::Other); // Chain + SDVTList NodeTys = DAG.getVTList(VTs); + + SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps); + SmallVector<SDValue, 4> PseudoLoadOps; + for (unsigned I = 0; I < N; ++I) + PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps); } EVT AArch64TargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType( // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. 
- bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return MVT::v2i64; - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return MVT::f128; - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return MVT::i64; - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return MVT::i32; return MVT::Other; } LLT AArch64TargetLowering::getOptimalMemOpLLT( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. - bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return LLT::vector(2, 64); - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return LLT::scalar(128); - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return LLT::scalar(64); - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return LLT::scalar(32); return LLT(); } @@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; + // FIXME: Update this method to support scalable addressing modes. 
+ if (isa<ScalableVectorType>(Ty)) + return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; + // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; @@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N, } if (FoundMatch) - return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } @@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); + unsigned Opc = Src->getOpcode(); + + // Zero/any extend of an unsigned unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + SDValue UnpkOp = Src->getOperand(0); + SDValue Dup = N->getOperand(1); + + if (Dup.getOpcode() != AArch64ISD::DUP) + return SDValue(); + + SDLoc DL(N); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); + uint64_t ExtVal = C->getZExtValue(); + + // If the mask is fully covered by the unpack, we don't need to push + // a new AND onto the operand + EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); + if ((ExtVal == 0xFF && EltTy == MVT::i8) || + (ExtVal == 0xFFFF && EltTy == MVT::i16) || + (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) + return Src; + + // Truncate to prevent a DUP with an over wide constant + APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); + + // Otherwise, make sure we propagate the AND to the operand + // of the unpack + Dup = DAG.getNode(AArch64ISD::DUP, DL, + UnpkOp->getValueType(0), + DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); + + SDValue And = DAG.getNode(ISD::AND, DL, + UnpkOp->getValueType(0), UnpkOp, Dup); + + return DAG.getNode(Opc, DL, N->getValueType(0), And); + } + SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) return SDValue(); - // GLD1* instructions perform an implicit zero-extend, which makes them + EVT MemVT; + + // SVE load instructions perform an implicit zero-extend, which makes them // perfect candidates for combining. 
- switch (Src->getOpcode()) { - case AArch64ISD::GLD1: - case AArch64ISD::GLD1_SCALED: - case AArch64ISD::GLD1_SXTW: - case AArch64ISD::GLD1_SXTW_SCALED: - case AArch64ISD::GLD1_UXTW: - case AArch64ISD::GLD1_UXTW_SCALED: - case AArch64ISD::GLD1_IMM: + switch (Opc) { + case AArch64ISD::LD1_MERGE_ZERO: + case AArch64ISD::LDNF1_MERGE_ZERO: + case AArch64ISD::LDFF1_MERGE_ZERO: + MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); + break; + case AArch64ISD::GLD1_MERGE_ZERO: + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + case AArch64ISD::GLDFF1_MERGE_ZERO: + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + case AArch64ISD::GLDNT1_MERGE_ZERO: + MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); break; default: return SDValue(); } - EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); - if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) return Src; @@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., @@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N, // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. - if (N->getNumOperands() == 2 && - N0->getOpcode() == ISD::TRUNCATE && - N1->getOpcode() == ISD::TRUNCATE) { + if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && + N1Opc == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); @@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + // Optimise concat_vectors of two [us]rhadds that use extracted subvectors + // from the same original vectors. Combine these into a single [us]rhadd that + // operates on the two original vectors. 
Example: + // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), + // extract_subvector (v16i8 OpB, + // <0>))), + // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), + // extract_subvector (v16i8 OpB, + // <8>))))) + // -> + // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) + if (N->getNumOperands() == 2 && N0Opc == N1Opc && + (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) { + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + + EVT N00VT = N00.getValueType(); + EVT N10VT = N10.getValueType(); + + if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { + SDValue N00Source = N00->getOperand(0); + SDValue N01Source = N01->getOperand(0); + SDValue N10Source = N10->getOperand(0); + SDValue N11Source = N11->getOperand(0); + + if (N00Source == N10Source && N01Source == N11Source && + N00Source.getValueType() == VT && N01Source.getValueType() == VT) { + assert(N0.getValueType() == N1.getValueType()); + + uint64_t N00Index = N00.getConstantOperandVal(1); + uint64_t N01Index = N01.getConstantOperandVal(1); + uint64_t N10Index = N10.getConstantOperandVal(1); + uint64_t N11Index = N11.getConstantOperandVal(1); + + if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && + N10Index == N00VT.getVectorNumElements()) + return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); + } + } + } + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. @@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - if (N1->getOpcode() != ISD::BITCAST) + if (N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); @@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, return SDValue(); } +static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op1 = N->getOperand(1); + SDValue Op2 = N->getOperand(2); + EVT ScalarTy = Op1.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) { + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); + Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); + } + + return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0), + Op1, Op2); +} + +static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + SDValue Scalar = N->getOperand(3); + EVT ScalarTy = Scalar.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) + Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); + + SDValue Passthru = N->getOperand(1); + SDValue Pred = N->getOperand(2); + return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), + Pred, Scalar, Passthru); +} + static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); LLVMContext &Ctx = *DAG.getContext(); @@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, EXT); } -static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, - bool Invert, +static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 
TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize()) @@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, } } + if (!Imm) + return SDValue(); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); - SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64); - SDValue Op0, Op1; - if (Invert) { - Op0 = Splat; - Op1 = N->getOperand(2); - } else { - Op0 = N->getOperand(2); - Op1 = Splat; - } - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - ID, Pred, Op0, Op1); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, + N->getOperand(2), Splat, DAG.getCondCode(CC)); } return SDValue(); @@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, return DAG.getZExtOrTrunc(Res, DL, VT); } +static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue VecToReduce = N->getOperand(2); + + EVT ReduceVT = VecToReduce.getValueType(); + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + +static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue InitVal = N->getOperand(2); + SDValue VecToReduce = N->getOperand(3); + EVT ReduceVT = VecToReduce.getValueType(); + + // Ordered reductions use the first lane of the result vector as the + // reduction's initial value. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, + DAG.getUNDEF(ReduceVT), InitVal, Zero); + + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. 
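[Editor's note] The comments in the hunk above describe the two SVE floating-point reduction shapes this patch lowers: plain reductions (combineSVEReductionFP), where only element 0 of the destination carries the result, and ordered reductions (combineSVEReductionOrderedFP), which additionally seed lane 0 with an initial value before folding strictly in lane order. A minimal standalone C++ sketch of that semantic difference, using plain arrays and a predicate in place of scalable vectors (names and framing are mine, not LLVM API):

#include <cassert>
#include <cstddef>

// Models an ordered reduction such as FADDA_PRED: fold strictly
// left-to-right from an explicit initial value, skipping inactive lanes.
double faddaModel(double Init, const bool *Pg, const double *Z, size_t N) {
  double Acc = Init;
  for (size_t I = 0; I < N; ++I)
    if (Pg[I])
      Acc += Z[I]; // sequential; order matters for FP rounding
  return Acc;
}

// Models an unordered reduction such as FADDV_PRED; the combine above only
// relies on the result landing in element 0, which it then extracts.
double faddvModel(const bool *Pg, const double *Z, size_t N) {
  double Acc = 0.0;
  for (size_t I = 0; I < N; ++I)
    if (Pg[I])
      Acc += Z[I]; // hardware may reassociate; the model is simplified
  return Acc;
}

int main() {
  const bool Pg[4] = {true, true, false, true};
  const double Z[4] = {1.0, 2.0, 100.0, 3.0};
  assert(faddaModel(10.0, Pg, Z, 4) == 16.0); // 10 + 1 + 2 + 3
  assert(faddvModel(Pg, Z, 4) == 6.0);
  return 0;
}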
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N, return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG); case Intrinsic::aarch64_sve_andv: return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG); + case Intrinsic::aarch64_sve_index: + return LowerSVEIntrinsicIndex(N, DAG); + case Intrinsic::aarch64_sve_dup: + return LowerSVEIntrinsicDUP(N, DAG); + case Intrinsic::aarch64_sve_dup_x: + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), + N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_smin: + return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umin: + return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_smax: + return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umax: + return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsl: + return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsr: + return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_asr: + return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_cmphs: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); + break; + case Intrinsic::aarch64_sve_cmphi: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); + break; + case Intrinsic::aarch64_sve_cmpge: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGE)); + break; + case Intrinsic::aarch64_sve_cmpgt: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGT)); + break; + case Intrinsic::aarch64_sve_cmpeq: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); + break; + case Intrinsic::aarch64_sve_cmpne: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), 
N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + break; + case Intrinsic::aarch64_sve_fadda: + return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); + case Intrinsic::aarch64_sve_faddv: + return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxnmv: + return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxv: + return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_fminnmv: + return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fminv: + return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); + case Intrinsic::aarch64_sve_sel: + return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmpeq_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); case Intrinsic::aarch64_sve_cmpne_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); case Intrinsic::aarch64_sve_cmpge_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); case Intrinsic::aarch64_sve_cmpgt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); case Intrinsic::aarch64_sve_cmple_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); case Intrinsic::aarch64_sve_cmphs_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); case Intrinsic::aarch64_sve_cmphi_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplo_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); case Intrinsic::aarch64_sve_cmpls_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); case Intrinsic::aarch64_sve_ptest_any: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::ANY_ACTIVE); @@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N, if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) + // If the source VT is a 64-bit fixed or scalable vector, we can play games + // and get the better results we want. 
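[Editor's note] Earlier in this hunk the wide-compare intrinsics stop being rewritten as "swap the operands and reuse another compare intrinsic" and instead map directly onto ISD condition codes (cmphs -> SETUGE, cmphi -> SETUGT, cmplo -> SETULT, cmpls -> SETULE, and so on). A small self-contained illustration of why the explicit operand swap becomes unnecessary once the condition code encodes the direction (scalar stand-ins, not the LLVM combine itself):

#include <cassert>
#include <cstdint>

// Scalar meaning of the unsigned condition codes used above.
bool setuge(uint32_t A, uint32_t B) { return A >= B; } // cmphs
bool setugt(uint32_t A, uint32_t B) { return A > B; }  // cmphi
bool setult(uint32_t A, uint32_t B) { return A < B; }  // cmplo
bool setule(uint32_t A, uint32_t B) { return A <= B; } // cmpls

int main() {
  assert(setuge(5, 5) && setugt(6, 5) && setult(4, 5) && setule(5, 5));
  // The old lowering expressed cmplo(a, b) as cmphi(b, a); with condition
  // codes the swap is redundant because SETULT already says "a < b".
  assert(setult(4, 5) == setugt(5, 4));
  return 0;
}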
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + ElementCount SrcEC = SrcVT.getVectorElementCount(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); @@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N, // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); + LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext()); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); + LoVT.getVectorElementCount()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); + DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, return NewST1; } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. +static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + case MVT::nxv8i8: + case MVT::nxv8i16: + case MVT::nxv8f16: + case MVT::nxv8bf16: + return MVT::nxv8i16; + case MVT::nxv16i8: + return MVT::nxv16i8; + } +} + +static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + EVT ContainerVT = VT; + if (ContainerVT.isInteger()) + ContainerVT = getSVEContainerType(ContainerVT); + + SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); + SDValue Ops[] = { N->getOperand(0), // Chain + N->getOperand(2), // Pg + N->getOperand(3), // Base + DAG.getValueType(VT) }; + + SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (ContainerVT.isInteger() && (VT != ContainerVT)) + Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({ Load, LoadChain }, DL); +} + static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); + if (VT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); @@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { return L; } +template <unsigned Opcode> +static SDValue 
performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { + static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || + Opcode == AArch64ISD::LD1RO_MERGE_ZERO, + "Unsupported opcode."); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + EVT LoadVT = VT; + if (VT.isFloatingPoint()) + LoadVT = VT.changeTypeToInteger(); + + SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; + SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (VT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + +static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Data = N->getOperand(2); + EVT DataVT = Data.getValueType(); + EVT HwSrcVt = getSVEContainerType(DataVT); + SDValue InputVT = DAG.getValueType(DataVT); + + if (DataVT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + if (DataVT.isFloatingPoint()) + InputVT = DAG.getValueType(HwSrcVt); + + SDValue SrcNew; + if (Data.getValueType().isFloatingPoint()) + SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); + else + SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); + + SDValue Ops[] = { N->getOperand(0), // Chain + SrcNew, + N->getOperand(4), // Base + N->getOperand(3), // Pg + InputVT + }; + + return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); +} + static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); @@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); + if (DataVT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); @@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); + // Avoid scalarizing zero splat stores for scalable vectors. + if (VT.isScalableVector()) + return SDValue(); + // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); @@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); - if (!VT.isVector()) + + if (!VT.isFixedLengthVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of @@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT.isScalableVector()) + return SDValue(); + unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. @@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } -// Returns an SVE type that ContentTy can be trivially sign or zero extended -// into. -static MVT getSVEContainerType(EVT ContentTy) { - assert(ContentTy.isSimple() && "No SVE containers for extended types"); +// Turns the vector of indices into a vector of byte offstes by scaling Offset +// by (BitWidth / 8). 
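[Editor's note] The helper introduced just below, getScaledOffsetForBitWidth, converts a vector of element indices into byte offsets by splatting log2(BitWidth / 8) and shifting left. A standalone scalar sketch of that scaling (illustrative only; __builtin_ctz stands in for LLVM's Log2_32):

#include <cassert>
#include <cstdint>

// Index -> byte offset for an element of the given bit width, done as a
// shift just like the SPLAT_VECTOR + ISD::SHL pair in the patch.
uint64_t scaleIndexToByteOffset(uint64_t Index, unsigned BitWidth) {
  unsigned Shift = __builtin_ctz(BitWidth / 8); // log2 of the element size
  return Index << Shift;
}

int main() {
  assert(scaleIndexToByteOffset(3, 32) == 12); // 32-bit elements: x4
  assert(scaleIndexToByteOffset(5, 64) == 40); // 64-bit elements: x8
  return 0;
}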
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, + SDLoc DL, unsigned BitWidth) { + assert(Offset.getValueType().isScalableVector() && + "This method is only for scalable vectors of offsets"); - switch (ContentTy.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("No known SVE container for this MVT type"); - case MVT::nxv2i8: - case MVT::nxv2i16: - case MVT::nxv2i32: - case MVT::nxv2i64: - case MVT::nxv2f32: - case MVT::nxv2f64: - return MVT::nxv2i64; - case MVT::nxv4i8: - case MVT::nxv4i16: - case MVT::nxv4i32: - case MVT::nxv4f32: - return MVT::nxv4i32; - } + SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64); + SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift); + + return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift); } -static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { +/// Check if the value of \p OffsetInBytes can be used as an immediate for +/// the gather load/prefetch and scatter store instructions with vector base and +/// immediate offset addressing mode: +/// +/// [<Zn>.[S|D]{, #<imm>}] +/// +/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. + +inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, + unsigned ScalarSizeInBytes) { + // The immediate is not a multiple of the scalar size. + if (OffsetInBytes % ScalarSizeInBytes) + return false; + + // The immediate is out of range. + if (OffsetInBytes / ScalarSizeInBytes > 31) + return false; + + return true; +} + +/// Check if the value of \p Offset represents a valid immediate for the SVE +/// gather load/prefetch and scatter store instructiona with vector base and +/// immediate offset addressing mode: +/// +/// [<Zn>.[S|D]{, #<imm>}] +/// +/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. +static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, + unsigned ScalarSizeInBytes) { + ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode()); + return OffsetConst && isValidImmForSVEVecImmAddrMode( + OffsetConst->getZExtValue(), ScalarSizeInBytes); +} + +static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { const SDValue Src = N->getOperand(2); const EVT SrcVT = Src->getValueType(0); assert(SrcVT.isScalableVector() && @@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(4); + SDValue Base = N->getOperand(4); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal scatters because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { + Offset = + getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); + Opcode = AArch64ISD::SSTNT1_PRED; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
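[Editor's note] The SST1_IMM handling above falls back to SST1/SST1_UXTW whenever isValidImmForSVEVecImmAddrMode rejects the offset: for the [<Zn>.[S|D]{, #<imm>}] addressing mode the immediate must be a multiple of the element size and at most 31 elements away. A small self-contained restatement of that rule with example offsets (the function name mirrors the patch, but this is a standalone model, not the LLVM code):

#include <cassert>

// imm must be k * sizeof(<T>) with k in [0, 31].
bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
                                    unsigned ScalarSizeInBytes) {
  return OffsetInBytes % ScalarSizeInBytes == 0 &&
         OffsetInBytes / ScalarSizeInBytes <= 31;
}

int main() {
  assert(isValidImmForSVEVecImmAddrMode(124, 4));  // 31 * 4 bytes: ok
  assert(!isValidImmForSVEVecImmAddrMode(128, 4)); // 32 * 4 bytes: too far
  assert(!isValidImmForSVEVecImmAddrMode(6, 4));   // not a multiple of 4
  return 0;
}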
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // SST1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the stored items. For + // immediates outside that range and non-immediate scalar offsets use SST1 or + // SST1_UXTW instead. + if (Opcode == AArch64ISD::SST1_IMM_PRED) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + SrcVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = AArch64ISD::SST1_UXTW_PRED; + else + Opcode = AArch64ISD::SST1_PRED; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Source value type that is representable in hardware EVT HwSrcVt = getSVEContainerType(SrcVT); - // Keep the original type of the input data to store - this is needed to - // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the - // integer equivalent, so just use HwSrcVt. + // Keep the original type of the input data to store - this is needed to be + // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For + // FP values we want the integer equivalent, so just use HwSrcVt. SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); @@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, DL, VTs, Ops); } -static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { - EVT RetVT = N->getValueType(0); +static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + const EVT RetVT = N->getValueType(0); assert(RetVT.isScalableVector() && "Gather loads are only possible for SVE vectors"); + SDLoc DL(N); + // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(3); + SDValue Base = N->getOperand(3); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal gathers because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { + Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, + RetVT.getScalarSizeInBits()); + Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && + Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // GLD{FF}1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the loaded items. For + // immediates outside that range and non-immediate scalar offsets use + // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. + if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || + Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + RetVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_UXTW_MERGE_ZERO + : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; + else + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_MERGE_ZERO + : AArch64ISD::GLDFF1_MERGE_ZERO; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); - // Keep the original output value type around - this will better inform - // optimisations (e.g. instruction folding when load is followed by - // zext/sext). This will only be used for ints, so the value for FPs - // doesn't matter. + // Keep the original output value type around - this is needed to be able to + // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP + // values we want the integer equivalent, so just use HwRetVT. SDValue OutVT = DAG.getValueType(RetVT); if (RetVT.isFloatingPoint()) OutVT = DAG.getValueType(HwRetVt); @@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, return DAG.getMergeValues({Load, LoadChain}, DL); } - static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc DL(N); SDValue Src = N->getOperand(0); unsigned Opc = Src->getOpcode(); - // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates + // Sign extend of an unsigned unpack -> signed unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + + unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI + : AArch64ISD::SUNPKLO; + + // Push the sign extend to the operand of the unpack + // This is necessary where, for example, the operand of the unpack + // is another unpack: + // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) + // -> + // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) + // -> + // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) + SDValue ExtOp = Src->getOperand(0); + auto VT = cast<VTSDNode>(N->getOperand(1))->getVT(); + EVT EltTy = VT.getVectorElementType(); + (void)EltTy; + + assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && + "Sign extending from an invalid type"); + + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), + VT.getVectorElementType(), + VT.getVectorElementCount() * 2); + + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), + ExtOp, DAG.getValueType(ExtVT)); + + return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); + } + + // SVE load nodes (e.g. 
AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. unsigned NewOpc; + unsigned MemVTOpNum = 4; switch (Opc) { - case AArch64ISD::GLD1: - NewOpc = AArch64ISD::GLD1S; + case AArch64ISD::LD1_MERGE_ZERO: + NewOpc = AArch64ISD::LD1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDNF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::GLD1_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; break; - case AArch64ISD::GLD1_SCALED: - NewOpc = AArch64ISD::GLD1S_SCALED; + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW: - NewOpc = AArch64ISD::GLD1S_SXTW; + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_SXTW_SCALED; + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW: - NewOpc = AArch64ISD::GLD1S_UXTW; + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_UXTW_SCALED; + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_IMM: - NewOpc = AArch64ISD::GLD1S_IMM; + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDNT1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; break; default: return SDValue(); } EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); - EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); + EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); - if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse()) + if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) return SDValue(); EVT DstVT = N->getValueType(0); SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); - SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2), - Src->getOperand(3), Src->getOperand(4)}; + + SmallVector<SDValue, 5> Ops; + for (unsigned I = 0; I < Src->getNumOperands(); ++I) + Ops.push_back(Src->getOperand(I)); SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); DCI.CombineTo(N, ExtLoad); @@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(N, 0); } +/// Legalize the gather prefetch (scalar + vector addressing mode) when the +/// offset vector is an unpacked 32-bit scalable vector. 
The other cases (Offset +/// != nxv2i32) do not need legalization. +static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { + const unsigned OffsetPos = 4; + SDValue Offset = N->getOperand(OffsetPos); + + // Not an unpacked vector, bail out. + if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) + return SDValue(); + + // Extend the unpacked offset vector to 64-bit lanes. + SDLoc DL(N); + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); + SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); + // Replace the offset operand with the 64-bit one. + Ops[OffsetPos] = Offset; + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + +/// Combines a node carrying the intrinsic +/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses +/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to +/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the +/// sve gather prefetch instruction with vector plus immediate addressing mode. +static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, + unsigned ScalarSizeInBytes) { + const unsigned ImmPos = 4, OffsetPos = 3; + // No need to combine the node if the immediate is valid... + if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) + return SDValue(); + + // ...otherwise swap the offset base with the offset... + SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); + std::swap(Ops[ImmPos], Ops[OffsetPos]); + // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to + // `aarch64_sve_prfb_gather_uxtw_index`. + SDLoc DL(N); + Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, + MVT::i64); + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ld1rq: + return 
performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); + case Intrinsic::aarch64_sve_ld1ro: + return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ld1: + return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnf1: + return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1: + return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_st1: + return performST1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); case Intrinsic::aarch64_sve_ld1_gather: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SCALED_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_sxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + return performGatherLoadCombine(N, DAG, + 
AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_ld1_gather_imm: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_IMM_MERGE_ZERO); case Intrinsic::aarch64_sve_st1_scatter: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); case Intrinsic::aarch64_sve_st1_scatter_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_SXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, - /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_st1_scatter_imm: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_UXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); + case Intrinsic::aarch64_sve_tuple_get: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Src1 = N->getOperand(2); + SDValue Idx = N->getOperand(3); + + uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); + EVT ResVT = N->getValueType(0); + uint64_t NumLanes = ResVT.getVectorElementCount().Min; + SDValue Val = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, + DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32)); + return DAG.getMergeValues({Val, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_set: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Tuple = N->getOperand(2); + SDValue Idx = N->getOperand(3); + SDValue Vec = N->getOperand(4); + + EVT TupleVT = Tuple.getValueType(); + uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + + uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); + uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + + if ((TupleLanes % NumLanes) != 0) + report_fatal_error("invalid tuple vector!"); + + uint64_t NumVecs = TupleLanes / NumLanes; + + SmallVector<SDValue, 4> Opnds; + for (unsigned I = 0; I < NumVecs; ++I) { + if (I == IdxConst) + Opnds.push_back(Vec); + else { + Opnds.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 
Vec.getValueType(), Tuple, + DAG.getConstant(I * NumLanes, DL, MVT::i32))); + } + } + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_create2: + case Intrinsic::aarch64_sve_tuple_create3: + case Intrinsic::aarch64_sve_tuple_create4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + + SmallVector<SDValue, 4> Opnds; + for (unsigned I = 2; I < N->getNumOperands(); ++I) + Opnds.push_back(N->getOperand(I)); + + EVT VT = Opnds[0].getValueType(); + EVT EltVT = VT.getVectorElementType(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + VT.getVectorElementCount() * + (N->getNumOperands() - 2)); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_ld2: + case Intrinsic::aarch64_sve_ld3: + case Intrinsic::aarch64_sve_ld4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Mask = N->getOperand(2); + SDValue BasePtr = N->getOperand(3); + SDValue LoadOps[] = {Chain, Mask, BasePtr}; + unsigned IntrinsicID = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + SDValue Result = + LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL); + return DAG.getMergeValues({Result, Chain}, DL); + } default: break; } @@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SDLoc DL(N); SDValue Op = N->getOperand(0); - if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + if (N->getValueType(0) != MVT::i16 || + (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16)) return; Op = SDValue( @@ -12759,6 +14286,40 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { return std::make_pair(Lo, Hi); } +void AArch64TargetLowering::ReplaceExtractSubVectorResults( + SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + + // Common code will handle these just fine. + if (!InVT.isScalableVector() || !InVT.isInteger()) + return; + + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // The following checks bail if this is not a halving operation. + + ElementCount ResEC = VT.getVectorElementCount(); + + if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) + return; + + auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!CIndex) + return; + + unsigned Index = CIndex->getZExtValue(); + if ((Index != 0) && (Index != ResEC.Min)) + return; + + unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; + EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); + + SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); +} + // Create an even/odd pair of X registers holding integer value V. 
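[Editor's note] The aarch64_sve_tuple_get / tuple_set cases above treat a register tuple as one wide concatenated vector and index into it in units of NumLanes (EXTRACT_SUBVECTOR at IdxConst * NumLanes, or a CONCAT_VECTORS with the Idx-th chunk replaced). A plain-array sketch of the same index arithmetic, with std::vector standing in for the concatenated tuple (illustrative only):

#include <algorithm>
#include <cassert>
#include <vector>

// Model of tuple_get: take the Idx-th part, each part NumLanes wide.
std::vector<int> tupleGet(const std::vector<int> &Tuple, unsigned Idx,
                          unsigned NumLanes) {
  auto Begin = Tuple.begin() + Idx * NumLanes; // matches IdxConst * NumLanes
  return std::vector<int>(Begin, Begin + NumLanes);
}

// Model of tuple_set: rebuild the tuple with the Idx-th part replaced.
std::vector<int> tupleSet(std::vector<int> Tuple, unsigned Idx,
                          const std::vector<int> &Vec) {
  std::copy(Vec.begin(), Vec.end(), Tuple.begin() + Idx * Vec.size());
  return Tuple;
}

int main() {
  std::vector<int> Tuple = {0, 1, 2, 3, 4, 5, 6, 7}; // two parts of 4 lanes
  assert(tupleGet(Tuple, 1, 4) == (std::vector<int>{4, 5, 6, 7}));
  assert(tupleSet(Tuple, 0, {9, 9, 9, 9}) ==
         (std::vector<int>{9, 9, 9, 9, 4, 5, 6, 7}));
  return 0;
}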
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); @@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); - Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); + SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + Results.push_back( + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } @@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); - Results.push_back(SDValue(CmpSwap, 0)); - Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, + SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); Results.push_back(SDValue(CmpSwap, 3)); } @@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::CTPOP: + Results.push_back(LowerCTPOP(SDValue(N, 0), DAG)); + return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; @@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.append({Pair, Result.getValue(2) /* Chain */}); return; } + case ISD::EXTRACT_SUBVECTOR: + ReplaceExtractSubVectorResults(N, Results, DAG); + return; case ISD::INTRINSIC_WO_CHAIN: { EVT VT = N->getValueType(0); assert((VT == MVT::i8 || VT == MVT::i16) && @@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - if (getTargetMachine().getOptLevel() == 0) + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::None; return AtomicExpansionKind::LLSC; } @@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = - Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); return OptSize && !VT.isVector(); } @@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } + +bool AArch64TargetLowering::shouldLocalize( + const MachineInstr &MI, const TargetTransformInfo *TTI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_GLOBAL_VALUE: { + // On Darwin, TLS global vars get selected into function calls, which + // we don't want localized, as they can get moved into the middle of a + // another call sequence. 
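[Editor's note] The ReplaceCMP_SWAP_128Results change earlier in this hunk now glues the two 64-bit halves of the compare-and-swap result back into a single i128 with ISD::BUILD_PAIR instead of pushing two separate i64 results. A minimal sketch of that pairing on a plain integer (unsigned __int128 is a GCC/Clang extension used purely for illustration):

#include <cassert>
#include <cstdint>

// Model of BUILD_PAIR for i128: combine Lo and Hi 64-bit halves.
unsigned __int128 buildPair(uint64_t Lo, uint64_t Hi) {
  return (static_cast<unsigned __int128>(Hi) << 64) | Lo;
}

int main() {
  unsigned __int128 V =
      buildPair(0x1111222233334444ULL, 0x5555666677778888ULL);
  assert(static_cast<uint64_t>(V) == 0x1111222233334444ULL);
  assert(static_cast<uint64_t>(V >> 64) == 0x5555666677778888ULL);
  return 0;
}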
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal(); + if (GV.isThreadLocal() && Subtarget->isTargetMachO()) + return false; + break; + } + // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being + // localizable. + case AArch64::ADRP: + case AArch64::G_ADD_LOW: + return true; + default: + break; + } + return TargetLoweringBase::shouldLocalize(MI, TTI); +} + +bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { + if (isa<ScalableVectorType>(Inst.getType())) + return true; + + for (unsigned i = 0; i < Inst.getNumOperands(); ++i) + if (isa<ScalableVectorType>(Inst.getOperand(i)->getType())) + return true; + + return false; +} + +// Return the largest legal scalable vector type that matches VT's element type. +static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE container"); + case MVT::i8: + return EVT(MVT::nxv16i8); + case MVT::i16: + return EVT(MVT::nxv8i16); + case MVT::i32: + return EVT(MVT::nxv4i32); + case MVT::i64: + return EVT(MVT::nxv2i64); + case MVT::f16: + return EVT(MVT::nxv8f16); + case MVT::f32: + return EVT(MVT::nxv4f32); + case MVT::f64: + return EVT(MVT::nxv2f64); + } +} + +// Return a PTRUE with active lanes corresponding to the extent of VT. +static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + + int PgPattern; + switch (VT.getVectorNumElements()) { + default: + llvm_unreachable("unexpected element count for SVE predicate"); + case 1: + PgPattern = AArch64SVEPredPattern::vl1; + break; + case 2: + PgPattern = AArch64SVEPredPattern::vl2; + break; + case 4: + PgPattern = AArch64SVEPredPattern::vl4; + break; + case 8: + PgPattern = AArch64SVEPredPattern::vl8; + break; + case 16: + PgPattern = AArch64SVEPredPattern::vl16; + break; + case 32: + PgPattern = AArch64SVEPredPattern::vl32; + break; + case 64: + PgPattern = AArch64SVEPredPattern::vl64; + break; + case 128: + PgPattern = AArch64SVEPredPattern::vl128; + break; + case 256: + PgPattern = AArch64SVEPredPattern::vl256; + break; + } + + // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can + // use AArch64SVEPredPattern::all, which can enable the use of unpredicated + // variants of instructions when available. 
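[Editor's note] getContainerForFixedLengthVector and getPredicateForFixedLengthVector above pick, respectively, the scalable container type for a legal fixed-length vector and a PTRUE pattern (vl1 through vl256) that activates exactly that many lanes. A compact standalone restatement of those two mappings, simplified to arithmetic and strings where the real code switches on MVTs (assumptions: one 128-bit granule per container, names are mine):

#include <cassert>
#include <string>

// A fixed-length vector of N elements runs under a PTRUE with the matching
// vlN pattern, so only its first N lanes of the container are active.
std::string vlPatternFor(unsigned NumElts) {
  switch (NumElts) {
  case 1:   return "vl1";   case 2:   return "vl2";   case 4:  return "vl4";
  case 8:   return "vl8";   case 16:  return "vl16";  case 32: return "vl32";
  case 64:  return "vl64";  case 128: return "vl128"; case 256: return "vl256";
  default:  return "unsupported"; // the real helper asserts here
  }
}

// The container keeps the element type and packs 128 / EltBits lanes per
// 128-bit granule (e.g. i8 -> nxv16i8, i32/f32 -> nxv4i32/nxv4f32).
unsigned containerLanesPerGranule(unsigned EltBits) { return 128 / EltBits; }

int main() {
  assert(vlPatternFor(4) == "vl4");
  assert(containerLanesPerGranule(32) == 4); // v4i32 -> nxv4i32
  assert(containerLanesPerGranule(8) == 16); // v16i8 -> nxv16i8
  return 0;
}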
+ + MVT MaskVT; + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE predicate"); + case MVT::i8: + MaskVT = MVT::nxv16i1; + break; + case MVT::i16: + case MVT::f16: + MaskVT = MVT::nxv8i1; + break; + case MVT::i32: + case MVT::f32: + MaskVT = MVT::nxv4i1; + break; + case MVT::i64: + case MVT::f64: + MaskVT = MVT::nxv2i1; + break; + } + + return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT, + DAG.getTargetConstant(PgPattern, DL, MVT::i64)); +} + +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal scalable vector!"); + auto PredTy = VT.changeVectorElementType(MVT::i1); + return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); +} + +static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { + if (VT.isFixedLengthVector()) + return getPredicateForFixedLengthVector(DAG, DL, VT); + + return getPredicateForScalableVector(DAG, DL, VT); +} + +// Grow V to consume an entire SVE register. +static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isScalableVector() && + "Expected to convert into a scalable vector!"); + assert(V.getValueType().isFixedLengthVector() && + "Expected a fixed length vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); +} + +// Shrink V so it's just big enough to maintain a VT's worth of data. +static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isFixedLengthVector() && + "Expected to convert into a fixed length vector!"); + assert(V.getValueType().isScalableVector() && + "Expected a scalable vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero); +} + +// Convert all fixed length vector loads larger than NEON to masked_loads. +SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Load = cast<LoadSDNode>(Op); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewLoad = DAG.getMaskedLoad( + ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT), + Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), + Load->getExtensionType()); + + auto Result = convertFromScalableVector(DAG, VT, NewLoad); + SDValue MergedValues[2] = {Result, Load->getChain()}; + return DAG.getMergeValues(MergedValues, DL); +} + +// Convert all fixed length vector stores larger than NEON to masked_stores. 
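[Editor's note] LowerFixedLengthVectorLoadToSVE above, and the store counterpart that follows, reframe an ordinary fixed-length load or store as a masked operation on the scalable container, with the vlN predicate from the helpers above keeping the surplus container lanes inactive. A scalar sketch of masked-load semantics under such a predicate (illustrative; the real lowering emits DAG masked load/store nodes):

#include <cassert>
#include <cstddef>

// Active lanes read memory; inactive lanes take the passthru value
// (the lowering above passes UNDEF for the passthru).
void maskedLoadModel(const int *Mem, const bool *Pg, const int *Passthru,
                     int *Dst, size_t ContainerLanes) {
  for (size_t I = 0; I < ContainerLanes; ++I)
    Dst[I] = Pg[I] ? Mem[I] : Passthru[I];
}

int main() {
  // A fixed-length v2i32 inside a 4-lane container: only lanes 0-1 active.
  const int Mem[4] = {10, 20, 30, 40};
  const bool Pg[4] = {true, true, false, false}; // "vl2" style predicate
  const int Undef[4] = {-1, -1, -1, -1};
  int Dst[4];
  maskedLoadModel(Mem, Pg, Undef, Dst, 4);
  assert(Dst[0] == 10 && Dst[1] == 20 && Dst[2] == -1);
  return 0;
}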
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Store = cast<StoreSDNode>(Op); + + SDLoc DL(Op); + EVT VT = Store->getValue().getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); + return DAG.getMaskedStore( + Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore()); +} + +SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + SDValue Val = Op.getOperand(0); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); + Val = convertToScalableVector(DAG, ContainerVT, Val); + + // Repeatedly truncate Val until the result is of the desired element type. + switch (ContainerVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unimplemented container type"); + case MVT::nxv2i64: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); + if (VT.getVectorElementType() == MVT::i32) + break; + LLVM_FALLTHROUGH; + case MVT::nxv4i32: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); + if (VT.getVectorElementType() == MVT::i16) + break; + LLVM_FALLTHROUGH; + case MVT::nxv8i16: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); + assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); + break; + } + + return convertFromScalableVector(DAG, VT, Val); +} + +SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, + SelectionDAG &DAG, + unsigned NewOp) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + auto Pg = getPredicateForVector(DAG, DL, VT); + + if (useSVEForFixedLengthVectorVT(VT)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + // Create list of operands by convereting existing ones to scalable types. 
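[Editor's note] The truncation ladder in LowerFixedLengthVectorTruncateToSVE above halves the element width one step at a time: bitcast to the narrower type, then UZP1 with the value as both operands, which keeps the even-numbered (low, in little-endian lane order) halves of each wide element. A scalar model of one such narrowing step (standalone illustration, not the LLVM code):

#include <cassert>
#include <cstdint>
#include <vector>

// One ladder step: viewing each 64-bit element as two 32-bit halves and
// keeping the even-indexed half is the same as truncating each element.
std::vector<uint32_t> uzp1NarrowStep(const std::vector<uint64_t> &Wide) {
  std::vector<uint32_t> Narrow;
  for (uint64_t V : Wide)
    Narrow.push_back(static_cast<uint32_t>(V)); // low half == even lane
  return Narrow;
}

int main() {
  std::vector<uint64_t> V = {0x1111222233334444ULL, 0xAAAABBBBCCCCDDDDULL};
  std::vector<uint32_t> T = uzp1NarrowStep(V);
  assert(T[0] == 0x33334444u && T[1] == 0xCCCCDDDDu);
  return 0;
}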
+ SmallVector<SDValue, 4> Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + if (isa<CondCodeSDNode>(V)) { + Operands.push_back(V); + continue; + } + + assert(useSVEForFixedLengthVectorVT(V.getValueType()) && + "Only fixed length vectors are supported!"); + Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); + } + + auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); + return convertFromScalableVector(DAG, VT, ScalableRes); + } + + assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); + + SmallVector<SDValue, 4> Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) && + "Only scalable vectors are supported!"); + Operands.push_back(V); + } + + return DAG.getNode(NewOp, DL, VT, Operands); +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 672dfc4fcbc06..4fe77481706b3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -25,6 +25,26 @@ namespace llvm { namespace AArch64ISD { +// For predicated nodes where the result is a vector, the operation is +// controlled by a governing predicate and the inactive lanes are explicitly +// defined with a value, please stick the following naming convention: +// +// _MERGE_OP<n> The result value is a vector with inactive lanes equal +// to source operand OP<n>. +// +// _MERGE_ZERO The result value is a vector with inactive lanes +// actively zeroed. +// +// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal +// to the last source operand which only purpose is being +// a passthru value. +// +// For other cases where no explicit action is needed to set the inactive lanes, +// or when the result is not a vector and it is needed or helpful to +// distinguish a node from similar unpredicated nodes, use: +// +// _PRED +// enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. @@ -52,6 +72,22 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions + // Arithmetic instructions + ADD_PRED, + FADD_PRED, + SDIV_PRED, + UDIV_PRED, + FMA_PRED, + SMIN_MERGE_OP1, + UMIN_MERGE_OP1, + SMAX_MERGE_OP1, + UMAX_MERGE_OP1, + SHL_MERGE_OP1, + SRL_MERGE_OP1, + SRA_MERGE_OP1, + + SETCC_MERGE_ZERO, + // Arithmetic instructions which write flags. ADDS, SUBS, @@ -90,9 +126,9 @@ enum NodeType : unsigned { BICi, ORRi, - // Vector bit select: similar to ISD::VSELECT but not all bits within an + // Vector bitwise select: similar to ISD::VSELECT but not all bits within an // element must be identical. - BSL, + BSP, // Vector arithmetic negation NEG, @@ -121,6 +157,10 @@ enum NodeType : unsigned { SRSHR_I, URSHR_I, + // Vector shift by constant and insert + VSLI, + VSRI, + // Vector comparisons CMEQ, CMGE, @@ -148,6 +188,10 @@ enum NodeType : unsigned { SADDV, UADDV, + // Vector rounding halving addition + SRHADD, + URHADD, + // Vector across-lanes min/max // Only the lower result lane is defined. SMINV, @@ -166,7 +210,7 @@ enum NodeType : unsigned { // Vector bitwise negation NOT, - // Vector bitwise selection + // Vector bitwise insertion BIT, // Compare-and-branch @@ -196,8 +240,10 @@ enum NodeType : unsigned { UMULL, // Reciprocal estimates and steps. 
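[Editor's note] The naming-convention comment added near the top of this AArch64ISD block (_MERGE_OP<n>, _MERGE_ZERO, _MERGE_PASSTHRU, _PRED) is about how a predicated node defines its inactive lanes. A scalar model of the three merging flavours, using a predicated add as the example operation (illustrative only):

#include <cassert>
#include <cstddef>

void addMergeZero(const bool *Pg, const int *A, const int *B, int *R,
                  size_t N) {
  for (size_t I = 0; I < N; ++I)
    R[I] = Pg[I] ? A[I] + B[I] : 0;            // _MERGE_ZERO
}
void addMergeOp1(const bool *Pg, const int *A, const int *B, int *R,
                 size_t N) {
  for (size_t I = 0; I < N; ++I)
    R[I] = Pg[I] ? A[I] + B[I] : A[I];         // _MERGE_OP1: keep operand 1
}
void addMergePassthru(const bool *Pg, const int *A, const int *B,
                      const int *Passthru, int *R, size_t N) {
  for (size_t I = 0; I < N; ++I)
    R[I] = Pg[I] ? A[I] + B[I] : Passthru[I];  // _MERGE_PASSTHRU
}

int main() {
  const bool Pg[2] = {true, false};
  const int A[2] = {1, 2}, B[2] = {10, 20}, P[2] = {7, 7};
  int R[2];
  addMergeZero(Pg, A, B, R, 2);        assert(R[1] == 0);
  addMergeOp1(Pg, A, B, R, 2);         assert(R[1] == 2);
  addMergePassthru(Pg, A, B, P, R, 2); assert(R[1] == 7);
  return 0;
}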
- FRECPE, FRECPS, - FRSQRTE, FRSQRTS, + FRECPE, + FRECPS, + FRSQRTE, + FRSQRTS, SUNPKHI, SUNPKLO, @@ -211,35 +257,97 @@ enum NodeType : unsigned { REV, TBL, + // Floating-point reductions. + FADDA_PRED, + FADDV_PRED, + FMAXV_PRED, + FMAXNMV_PRED, + FMINV_PRED, + FMINNMV_PRED, + INSR, PTEST, PTRUE, + DUP_MERGE_PASSTHRU, + INDEX_VECTOR, + + REINTERPRET_CAST, + + LD1_MERGE_ZERO, + LD1S_MERGE_ZERO, + LDNF1_MERGE_ZERO, + LDNF1S_MERGE_ZERO, + LDFF1_MERGE_ZERO, + LDFF1S_MERGE_ZERO, + LD1RQ_MERGE_ZERO, + LD1RO_MERGE_ZERO, + + // Structured loads. + SVE_LD2_MERGE_ZERO, + SVE_LD3_MERGE_ZERO, + SVE_LD4_MERGE_ZERO, + // Unsigned gather loads. - GLD1, - GLD1_SCALED, - GLD1_UXTW, - GLD1_SXTW, - GLD1_UXTW_SCALED, - GLD1_SXTW_SCALED, - GLD1_IMM, + GLD1_MERGE_ZERO, + GLD1_SCALED_MERGE_ZERO, + GLD1_UXTW_MERGE_ZERO, + GLD1_SXTW_MERGE_ZERO, + GLD1_UXTW_SCALED_MERGE_ZERO, + GLD1_SXTW_SCALED_MERGE_ZERO, + GLD1_IMM_MERGE_ZERO, // Signed gather loads - GLD1S, - GLD1S_SCALED, - GLD1S_UXTW, - GLD1S_SXTW, - GLD1S_UXTW_SCALED, - GLD1S_SXTW_SCALED, - GLD1S_IMM, + GLD1S_MERGE_ZERO, + GLD1S_SCALED_MERGE_ZERO, + GLD1S_UXTW_MERGE_ZERO, + GLD1S_SXTW_MERGE_ZERO, + GLD1S_UXTW_SCALED_MERGE_ZERO, + GLD1S_SXTW_SCALED_MERGE_ZERO, + GLD1S_IMM_MERGE_ZERO, + + // Unsigned gather loads. + GLDFF1_MERGE_ZERO, + GLDFF1_SCALED_MERGE_ZERO, + GLDFF1_UXTW_MERGE_ZERO, + GLDFF1_SXTW_MERGE_ZERO, + GLDFF1_UXTW_SCALED_MERGE_ZERO, + GLDFF1_SXTW_SCALED_MERGE_ZERO, + GLDFF1_IMM_MERGE_ZERO, + + // Signed gather loads. + GLDFF1S_MERGE_ZERO, + GLDFF1S_SCALED_MERGE_ZERO, + GLDFF1S_UXTW_MERGE_ZERO, + GLDFF1S_SXTW_MERGE_ZERO, + GLDFF1S_UXTW_SCALED_MERGE_ZERO, + GLDFF1S_SXTW_SCALED_MERGE_ZERO, + GLDFF1S_IMM_MERGE_ZERO, + + // Non-temporal gather loads + GLDNT1_MERGE_ZERO, + GLDNT1_INDEX_MERGE_ZERO, + GLDNT1S_MERGE_ZERO, + + // Contiguous masked store. + ST1_PRED, + // Scatter store - SST1, - SST1_SCALED, - SST1_UXTW, - SST1_SXTW, - SST1_UXTW_SCALED, - SST1_SXTW_SCALED, - SST1_IMM, + SST1_PRED, + SST1_SCALED_PRED, + SST1_UXTW_PRED, + SST1_SXTW_PRED, + SST1_UXTW_SCALED_PRED, + SST1_SXTW_SCALED_PRED, + SST1_IMM_PRED, + + // Non-temporal scatter store + SSTNT1_PRED, + SSTNT1_INDEX_PRED, + + // Strict (exception-raising) floating point comparison + STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCMPE, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -272,7 +380,8 @@ enum NodeType : unsigned { STZ2G, LDP, - STP + STP, + STNP }; } // end namespace AArch64ISD @@ -321,7 +430,8 @@ public: return MVT::getIntegerVT(64); } - bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; @@ -333,9 +443,10 @@ public: MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; /// LLT variant. - bool allowsMisalignedMemoryAccesses( - LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *Fast = nullptr) const override; + bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, + Align Alignment, + MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. 
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -376,9 +487,6 @@ public: MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, - MachineBasicBlock *BB) const; - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -402,7 +510,7 @@ public: bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -418,13 +526,11 @@ public: bool shouldConsiderGEPOffsetSplit() const override; - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; - LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(const MemOp &Op, + const AttributeList &FuncAttributes) const override; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. @@ -463,6 +569,13 @@ public: bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; + bool shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool MathUsed) const override { + // Using overflow ops for overflow checks only should beneficial on + // AArch64. + return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); + } + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, @@ -497,7 +610,7 @@ public: /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. - unsigned + Register getExceptionPointerRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X0; @@ -505,7 +618,7 @@ public: /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. - unsigned + Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X1; @@ -611,13 +724,27 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; - MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + MachineMemOperand::Flags getTargetMMOFlags( + const Instruction &I) const override; bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; /// Used for exception handling on Win64. bool needsFixedCatchObjects() const override; + + bool fallBackToDAGISel(const Instruction &Inst) const override; + + /// SVE code generation for fixed length vectors does not custom lower + /// BUILD_VECTOR. This makes BUILD_VECTOR legalisation a source of stores to + /// merge. However, merging them creates a BUILD_VECTOR that is just as + /// illegal as the original, thus leading to an infinite legalisation loop. 
+ /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal + /// vector types this override can be removed. + bool mergeStoresAfterLegalization(EVT VT) const override { + return !useSVEForFixedLengthVectors(); + } + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -626,6 +753,7 @@ private: bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); + void addTypeForFixedLengthSVE(MVT VT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); @@ -729,7 +857,11 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, + unsigned NewOp) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; @@ -746,6 +878,8 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; @@ -753,6 +887,13 @@ private: SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const; + SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps, + EVT VT, SelectionDAG &DAG, const SDLoc &DL) const; + + SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op, + SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; @@ -807,10 +948,19 @@ private: void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + void ReplaceExtractSubVectorResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const; bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; void finalizeLowering(MachineFunction &MF) const override; + + bool shouldLocalize(const MachineInstr &MI, + const TargetTransformInfo *TTI) const override; + + bool useSVEForFixedLengthVectors() const; + bool useSVEForFixedLengthVectorVT(EVT VT) const; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index c3efe03a0987f..6df7970f4d82b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -20,6 +20,30 @@ class Format<bits<2> val> { def PseudoFrm : Format<0>; def NormalFrm : Format<1>; // Do we need any others? 
+// Enum describing whether an instruction is +// destructive in its first source operand. +class DestructiveInstTypeEnum<bits<4> val> { + bits<4> Value = val; +} +def NotDestructive : DestructiveInstTypeEnum<0>; +// Destructive in its first operand and can be MOVPRFX'd, but has no other +// special properties. +def DestructiveOther : DestructiveInstTypeEnum<1>; +def DestructiveUnary : DestructiveInstTypeEnum<2>; +def DestructiveBinaryImm : DestructiveInstTypeEnum<3>; +def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>; +def DestructiveBinary : DestructiveInstTypeEnum<5>; +def DestructiveBinaryComm : DestructiveInstTypeEnum<6>; +def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; +def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; + +class FalseLanesEnum<bits<2> val> { + bits<2> Value = val; +} +def FalseLanesNone : FalseLanesEnum<0>; +def FalseLanesZero : FalseLanesEnum<1>; +def FalseLanesUndef : FalseLanesEnum<2>; + // AArch64 Instruction Format class AArch64Inst<Format f, string cstr> : Instruction { field bits<32> Inst; // Instruction encoding. @@ -34,6 +58,16 @@ class AArch64Inst<Format f, string cstr> : Instruction { let Namespace = "AArch64"; Format F = f; bits<2> Form = F.Value; + + // Defaults + FalseLanesEnum FalseLanes = FalseLanesNone; + DestructiveInstTypeEnum DestructiveInstType = NotDestructive; + ElementSizeEnum ElementSize = ElementSizeNone; + + let TSFlags{8-7} = FalseLanes.Value; + let TSFlags{6-3} = DestructiveInstType.Value; + let TSFlags{2-0} = ElementSize.Value; + let Pattern = []; let Constraints = cstr; } @@ -48,6 +82,7 @@ class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = ""> dag InOperandList = iops; let Pattern = pattern; let isCodeGenOnly = 1; + let isPseudo = 1; } // Real instructions (have encoding information) @@ -56,14 +91,6 @@ class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> { let Size = 4; } -// Enum describing whether an instruction is -// destructive in its first source operand. -class DestructiveInstTypeEnum<bits<1> val> { - bits<1> Value = val; -} -def NotDestructive : DestructiveInstTypeEnum<0>; -def Destructive : DestructiveInstTypeEnum<1>; - // Normal instructions class I<dag oops, dag iops, string asm, string operands, string cstr, list<dag> pattern> @@ -71,13 +98,6 @@ class I<dag oops, dag iops, string asm, string operands, string cstr, dag OutOperandList = oops; dag InOperandList = iops; let AsmString = !strconcat(asm, operands); - - // Destructive operations (SVE) - DestructiveInstTypeEnum DestructiveInstType = NotDestructive; - ElementSizeEnum ElementSize = ElementSizeB; - - let TSFlags{3} = DestructiveInstType.Value; - let TSFlags{2-0} = ElementSize.Value; } class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; @@ -327,6 +347,18 @@ def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]> let DecoderMethod = "DecodeSImm<5>"; } +def simm5_8b : Operand<i32>, ImmLeaf<i32, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<8>"; +} + +def simm5_16b : Operand<i32>, ImmLeaf<i32, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<16>"; +} + // simm7sN predicate - True if the immediate is a multiple of N in the range // [-64 * N, 63 * N]. 
@@ -349,6 +381,8 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } +def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>; + def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; @@ -358,6 +392,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>; def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>; +def UImmS1XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; def UImmS2XForm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); }]>; @@ -446,6 +483,19 @@ def uimm6s16 : Operand<i64>, ImmLeaf<i64, let ParserMatchClass = UImm6s16Operand; } +def SImmS2XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def SImmS3XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64); +}]>; +def SImmS4XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def SImmS16XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); +}]>; + // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>; @@ -461,6 +511,7 @@ def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>; def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; +def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>; def simm4s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >=-8 && Imm <= 7; }]> { @@ -469,31 +520,37 @@ def simm4s1 : Operand<i64>, ImmLeaf<i64, } def simm4s2 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> { +[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> { let PrintMethod = "printImmScale<2>"; let ParserMatchClass = SImm4s2Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s3 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> { +[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> { let PrintMethod = "printImmScale<3>"; let ParserMatchClass = SImm4s3Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s4 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> { +[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> { let PrintMethod = "printImmScale<4>"; let ParserMatchClass = SImm4s4Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s16 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> { +[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> { let PrintMethod = "printImmScale<16>"; let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } +def simm4s32 : Operand<i64>, ImmLeaf<i64, +[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { + let PrintMethod = "printImmScale<32>"; + let ParserMatchClass = SImm4s32Operand; + let DecoderMethod = "DecodeSImm<4>"; +} def Imm1_8Operand : AsmImmRange<1, 8>; def 
Imm1_16Operand : AsmImmRange<1, 16>; @@ -647,6 +704,13 @@ def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{ let DecoderMethod = "DecodeVecShiftR32Imm"; let ParserMatchClass = Imm1_32Operand; } +def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; @@ -683,6 +747,36 @@ def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm0_63Operand; } +// Same as vecshiftL#N, but use TargetConstant (TimmLeaf) instead of Constant +// (ImmLeaf) +def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) < 8); +}]> { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def tvecshiftL16 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) < 16); +}]> { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def tvecshiftL32 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) < 32); +}]> { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def tvecshiftL64 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) < 64); +}]> { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} // Crazy immediate formats used by 32-bit and 64-bit logical immediate // instructions for splatting repeating bit patterns across the immediate. 
@@ -796,7 +890,7 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ } // timm0_31 predicate - same ass imm0_31, but use TargetConstant (TimmLeaf) -// instead of Contant (ImmLeaf) +// instead of Constant (ImmLeaf) def timm0_31 : Operand<i64>, TImmLeaf<i64, [{ return ((uint64_t)Imm) < 32; }]> { @@ -832,7 +926,7 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ } // imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7] -def imm32_0_7 : Operand<i32>, ImmLeaf<i32, [{ +def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{ return ((uint32_t)Imm) < 8; }]> { let ParserMatchClass = Imm0_7Operand; @@ -1091,29 +1185,44 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass { let RenderMethod = "addVectorIndexOperands"; } -class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred> - : Operand<ty>, ImmLeaf<ty, pred> { +class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc> + : Operand<ty> { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } +multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> { + def "" : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>; + def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>; +} + def VectorIndex1Operand : AsmVectorIndex<1, 1>; def VectorIndexBOperand : AsmVectorIndex<0, 15>; def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; -def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>; -def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>; -def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>; -def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>; - -def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; -def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>; -def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>; -def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>; -def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>; +defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand, + [{ return ((uint64_t)Imm) == 1; }]>; +defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand, + [{ return ((uint64_t)Imm) < 16; }]>; +defm VectorIndexH : VectorIndex<i64, VectorIndexHOperand, + [{ return ((uint64_t)Imm) < 8; }]>; +defm VectorIndexS : VectorIndex<i64, VectorIndexSOperand, + [{ return ((uint64_t)Imm) < 4; }]>; +defm VectorIndexD : VectorIndex<i64, VectorIndexDOperand, + [{ return ((uint64_t)Imm) < 2; }]>; + +defm VectorIndex132b : VectorIndex<i32, VectorIndex1Operand, + [{ return ((uint64_t)Imm) == 1; }]>; +defm VectorIndexB32b : VectorIndex<i32, VectorIndexBOperand, + [{ return ((uint64_t)Imm) < 16; }]>; +defm VectorIndexH32b : VectorIndex<i32, VectorIndexHOperand, + [{ return ((uint64_t)Imm) < 8; }]>; +defm VectorIndexS32b : VectorIndex<i32, VectorIndexSOperand, + [{ return ((uint64_t)Imm) < 4; }]>; +defm VectorIndexD32b : VectorIndex<i32, VectorIndexDOperand, + [{ return ((uint64_t)Imm) < 2; }]>; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def 
SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; @@ -1121,16 +1230,21 @@ def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">; def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; -def sve_elm_idx_extdup_b - : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>; -def sve_elm_idx_extdup_h - : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>; -def sve_elm_idx_extdup_s - : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>; -def sve_elm_idx_extdup_d - : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>; -def sve_elm_idx_extdup_q - : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>; +defm sve_elm_idx_extdup_b + : VectorIndex<i64, SVEVectorIndexExtDupBOperand, + [{ return ((uint64_t)Imm) < 64; }]>; +defm sve_elm_idx_extdup_h + : VectorIndex<i64, SVEVectorIndexExtDupHOperand, + [{ return ((uint64_t)Imm) < 32; }]>; +defm sve_elm_idx_extdup_s + : VectorIndex<i64, SVEVectorIndexExtDupSOperand, + [{ return ((uint64_t)Imm) < 16; }]>; +defm sve_elm_idx_extdup_d + : VectorIndex<i64, SVEVectorIndexExtDupDOperand, + [{ return ((uint64_t)Imm) < 8; }]>; +defm sve_elm_idx_extdup_q + : VectorIndex<i64, SVEVectorIndexExtDupQOperand, + [{ return ((uint64_t)Imm) < 4; }]>; // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh @@ -1533,6 +1647,8 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm, let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeAuthLoadInstruction"; } multiclass AuthLoad<bit M, string asm, Operand opr> { @@ -4333,14 +4449,14 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { // Unscaled half-precision to 32-bit def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 0; // 32-bit GPR flag let Predicates = [HasFullFP16]; } // Unscaled half-precision to 64-bit def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; } @@ -4375,7 +4491,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, // Scaled half-precision to 32-bit def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, fixedpoint_f16_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; @@ -4385,7 +4501,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, // Scaled half-precision to 64-bit def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, fixedpoint_f16_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; @@ -4501,7 +4617,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { // Scaled def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, - [(set FPR16:$Rd, + [(set (f16 FPR16:$Rd), (fdiv (node 
GPR32:$Rn), fixedpoint_f16_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag @@ -4529,7 +4645,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { } def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, - [(set FPR16:$Rd, + [(set (f16 FPR16:$Rd), (fdiv (node GPR64:$Rn), fixedpoint_f16_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag @@ -4702,19 +4818,19 @@ class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType, multiclass FPConversion<string asm> { // Double-precision to Half-precision def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fpround FPR64:$Rn))]>; + [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>; // Double-precision to Single-precision def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fpround FPR64:$Rn))]>; + [(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>; // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, @@ -4722,7 +4838,7 @@ multiclass FPConversion<string asm> { // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fpround FPR32:$Rn))]>; + [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>; } //--- @@ -4824,7 +4940,7 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm, multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, - [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } @@ -4866,7 +4982,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, SDPatternOperator node> { def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, - [(set FPR16:$Rd, + [(set (f16 FPR16:$Rd), (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; @@ -4928,7 +5044,7 @@ multiclass FPComparison<bit signalAllNans, string asm, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV] in { def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, - [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } @@ -5142,6 +5258,47 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern> + : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>, + Sched<[WriteV]>; + +multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorPseudo<V64, + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorPseudo<V128, + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 
V128:$Rn), + (v16i8 V128:$Rm)))]>; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { @@ -5362,7 +5519,7 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, } multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, - string asm, SDPatternOperator OpNode> { + string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), @@ -5402,11 +5559,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. -class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1, +class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1, string kind2, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1, + BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1, [(set (AccumType RegType:$dst), (OpNode (AccumType RegType:$Rd), (InputType RegType:$Rn), @@ -5414,10 +5571,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1, let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); } -multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, +multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -6581,13 +6738,13 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, + def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, + def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set FPR32:$Rd, (OpNode 
FPR32:$Rn, FPR32:$Rm))]>; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, - [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>; } // Predicates = [HasNEON, HasFullFP16] } @@ -6598,12 +6755,12 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, + def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, + def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, []>; } // Predicates = [HasNEON, HasFullFP16] } @@ -6794,7 +6951,7 @@ multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; let Predicates = [HasNEON, HasFullFP16] in { def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, - [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>; } } @@ -6936,10 +7093,10 @@ multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, let Predicates = [HasNEON, HasFullFP16] in { def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, asm, ".4h", - [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>; def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, asm, ".8h", - [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", @@ -7136,7 +7293,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst, (inst V128:$dst, idxtype:$idx, regtype:$src)>; class SIMDInsElementMovAlias<string size, Instruction inst, Operand idxtype> - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -7377,7 +7534,7 @@ class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype, class SIMDScalarCPYAlias<string asm, string size, Instruction inst, RegisterClass regtype, RegisterOperand vectype, Operand idxtype> - : InstAlias<asm # "{\t$dst, $src" # size # "$index" # + : InstAlias<asm # "{\t$dst, $src" # size # "$index" # "|\t$dst, $src$index}", (inst regtype:$dst, vectype:$src, idxtype:$index), 0>; @@ -7651,13 +7808,152 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } + +//---------------------------------------------------------------------------- +// Armv8.6 BFloat16 Extension +//---------------------------------------------------------------------------- +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in { + +class 
BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1, + string kind2, RegisterOperand RegType, + ValueType AccumType, ValueType InputType> + : BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst), + (int_aarch64_neon_bfdot (AccumType RegType:$Rd), + (InputType RegType:$Rn), + (InputType RegType:$Rm)))]> { + let AsmString = !strconcat(asm, + "{\t$Rd" # kind1 # ", $Rn" # kind2 # + ", $Rm" # kind2 # "}"); +} + +multiclass SIMDThreeSameVectorBFDot<bit U, string asm> { + def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, + v2f32, v8i8>; + def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, + v4f32, v16i8>; +} + +class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm, + string dst_kind, string lhs_kind, + string rhs_kind, + RegisterOperand RegType, + ValueType AccumType, + ValueType InputType> + : BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111, + RegType, RegType, V128, VectorIndexS, + asm, "", dst_kind, lhs_kind, rhs_kind, + [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_bfdot + (AccumType RegType:$Rd), + (InputType RegType:$Rn), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexH:$idx)))))))]> { + + bits<2> idx; + let Inst{21} = idx{0}; // L + let Inst{11} = idx{1}; // H +} + +multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> { + + def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", + ".2h", V64, v2f32, v8i8>; + def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", + ".2h", V128, v4f32, v16i8>; +} + +class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> + : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); +} + +class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode> + : I<(outs V128:$dst), + (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, + "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", + [(set (v4f32 V128:$dst), + (v4f32 (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 (bitconvert (v8bf16 + (AArch64duplane16 (v8bf16 V128_lo:$Rm), + VectorIndexH:$idx)))))))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<4> Rm; + bits<3> idx; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-22} = 0b00111111; + let Inst{21-20} = idx{1-0}; + let Inst{19-16} = Rm; + let Inst{15-12} = 0b1111; + let Inst{11} = idx{2}; // H + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDThreeSameVectorBF16MatrixMul<string asm> + : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101, + V128, asm, ".4s", + [(set (v4f32 V128:$dst), + (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", + ", $Rm", ".8h", "}"); +} + +class SIMD_BFCVTN + : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, + "bfcvtn", ".4h", ".4s", + [(set (v8bf16 V128:$Rd), + (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; + +class SIMD_BFCVTN2 + : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, + "bfcvtn2", ".8h", ".4s", + [(set (v8bf16 V128:$dst), + (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; + +class BF16ToSinglePrecision<string asm> + : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", + [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 
FPR32:$Rn)))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-10} = 0b0001111001100011010000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} +} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0 + +//---------------------------------------------------------------------------- +// Armv8.6 Matrix Multiply Extension +//---------------------------------------------------------------------------- + +class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode> + : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; +} + +//---------------------------------------------------------------------------- // ARMv8.2-A Dot Product Instructions (Indexed) -class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind, - string lhs_kind, string rhs_kind, +class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm, + string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128, + BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128, VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind, [(set (AccumType RegType:$dst), (AccumType (OpNode (AccumType RegType:$Rd), @@ -7670,11 +7966,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind, let Inst{11} = idx{1}; // H } -multiclass SIMDThreeSameVectorDotIndex<bit U, string asm, +multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } @@ -7813,6 +8109,34 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, } multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + // Patterns for f16: DUPLANE, DUP scalar and vector_extract. 
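+  // (The DUP-scalar cases below reuse the indexed instruction by moving the
+  //  f16 operand into the low half-word lane of a 128-bit register with
+  //  SUBREG_TO_REG and selecting lane 0.)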
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast<Instruction>(INST # "v8i16_indexed") + V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast<Instruction>(INST # "v4i16_indexed") + V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn), + (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))), + (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn, + V128_lo:$Rm, VectorIndexH:$idx)>; + } // Predicates = [HasNEON, HasFullFP16] + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -7847,15 +8171,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - // 2 variants for 32-bit scalar version: extract from .2s or from .4s + // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), @@ -7940,6 +8260,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { } } +multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane, + SDPatternOperator OpNodeLaneQ> { + + def : Pat<(v4i16 (OpNodeLane + (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast<Instruction>(NAME # v4i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i16 (OpNodeLaneQ + (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLane + (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast<Instruction>(NAME # v8i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLaneQ + (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLane + (v2i32 V64:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast<Instruction>(NAME # v2i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLaneQ + 
(v2i32 V64:$Rn), (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLane + (v4i32 V128:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast<Instruction>(NAME # v4i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLaneQ + (v4i32 V128:$Rn), + (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + +} + multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, @@ -10154,15 +10532,15 @@ class ComplexRotationOperand<int Angle, int Remainder, string Type> let DiagnosticType = "InvalidComplexRotation" # Type; let Name = "ComplexRotation" # Type; } -def complexrotateop : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], - SDNodeXForm<imm, [{ +def complexrotateop : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; let PrintMethod = "printComplexRotationOp<90, 0>"; } -def complexrotateopodd : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], - SDNodeXForm<imm, [{ +def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td new file mode 100644 index 0000000000000..a0e7c782f68c3 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -0,0 +1,124 @@ +//=----- AArch64InstrGISel.td - AArch64 GISel target pseudos -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AArch64 GlobalISel target pseudo instruction definitions. This is kept +// separately from the other tablegen files for organizational purposes, but +// share the same infrastructure. +// +//===----------------------------------------------------------------------===// + + +class AArch64GenericInstruction : GenericInstruction { + let Namespace = "AArch64"; +} + +// A pseudo to represent a relocatable add instruction as part of address +// computation. +def G_ADD_LOW : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src, type2:$imm); + let hasSideEffects = 0; +} + +// Pseudo for a rev16 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV16 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev32 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV32 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev64 instruction. 
Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV64 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Represents an uzp1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an uzp2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_ZIP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_ZIP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a dup instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_DUP: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$lane); + let hasSideEffects = 0; +} +// Represents a trn1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_TRN1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a trn2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_TRN2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an ext instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. 
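+// Unlike the pseudos above, G_EXT additionally carries an immediate operand:
+// the byte position at which extraction from the concatenated source vectors
+// starts, matching the ext instruction's index.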
+def G_EXT: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm); +} + +def : GINodeEquiv<G_REV16, AArch64rev16>; +def : GINodeEquiv<G_REV32, AArch64rev32>; +def : GINodeEquiv<G_REV64, AArch64rev64>; +def : GINodeEquiv<G_UZP1, AArch64uzp1>; +def : GINodeEquiv<G_UZP2, AArch64uzp2>; +def : GINodeEquiv<G_ZIP1, AArch64zip1>; +def : GINodeEquiv<G_ZIP2, AArch64zip2>; +def : GINodeEquiv<G_DUP, AArch64dup>; +def : GINodeEquiv<G_TRN1, AArch64trn1>; +def : GINodeEquiv<G_TRN2, AArch64trn2>; +def : GINodeEquiv<G_EXT, AArch64ext>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 54f3f7c101324..5139ae5ccaf19 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -24,9 +24,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -111,6 +111,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // This gets lowered to an instruction sequence which takes 16 bytes NumBytes = 16; break; + case AArch64::SpeculationBarrierISBDSBEndBB: + // This gets lowered to 2 4-byte instructions. + NumBytes = 8; + break; + case AArch64::SpeculationBarrierSBEndBB: + // This gets lowered to 1 4-byte instructions. + NumBytes = 4; + break; case AArch64::JumpTableDest32: case AArch64::JumpTableDest16: case AArch64::JumpTableDest8: @@ -119,11 +127,25 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case AArch64::SPACE: NumBytes = MI.getOperand(1).getImm(); break; + case TargetOpcode::BUNDLE: + NumBytes = getInstBundleLength(MI); + break; } return NumBytes; } +unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { + unsigned Size = 0; + MachineBasicBlock::const_instr_iterator I = MI.getIterator(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + assert(!I->isBundle() && "No nested bundle!"); + Size += getInstSizeInBytes(*I); + } + return Size; +} + static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl<MachineOperand> &Cond) { // Block ends with fall-through condbranch. @@ -216,6 +238,12 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (I == MBB.end()) return false; + // Skip over SpeculationBarrierEndBB terminators + if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || + I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { + --I; + } + if (!isUnpredicatedTerminator(*I)) return false; @@ -496,8 +524,9 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, int &TrueCycles, + Register DstReg, Register TrueReg, + Register FalseReg, int &CondCycles, + int &TrueCycles, int &FalseCycles) const { // Check register classes. 
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -506,6 +535,12 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, if (!RC) return false; + // Also need to check the dest regclass, in case we're trying to optimize + // something like: + // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 + if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) + return false; + // Expanding cbz/tbz requires an extra cycle of latency on the condition. unsigned ExtraCondLat = Cond.size() != 1; @@ -538,9 +573,9 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, + const DebugLoc &DL, Register DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const { + Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Parse the condition code, see parseCondBranch() above. @@ -910,7 +945,7 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, + Register &SrcReg, Register &DstReg, unsigned &SubIdx) const { switch (MI.getOpcode()) { default: @@ -935,6 +970,7 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned WidthA = 0, WidthB = 0; + bool OffsetAIsScalable = false, OffsetBIsScalable = false; assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); @@ -948,9 +984,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( // base are identical, and the offset of a lower memory access + // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. - if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && - getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { - if (BaseOpA->isIdenticalTo(*BaseOpB)) { + // If OffsetAIsScalable and OffsetBIsScalable are both true, they + // are assumed to have the same scale (vscale). + if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, + WidthA, TRI) && + getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, + WidthB, TRI)) { + if (BaseOpA->isIdenticalTo(*BaseOpB) && + OffsetAIsScalable == OffsetBIsScalable) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; @@ -984,8 +1025,8 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const { // The first operand can be a frame index where we'd normally expect a // register. @@ -1156,10 +1197,9 @@ static bool areCFlagsAccessedBetweenInstrs( return MI.getIterator() == From; }) != To->getParent()->rend()); - // We iterate backward starting \p To until we hit \p From. 
- for (--To; To != From; --To) { - const MachineInstr &Instr = *To; - + // We iterate backward starting at \p To until we hit \p From. + for (const MachineInstr &Instr : + instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { if (((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) @@ -1180,7 +1220,7 @@ static bool areCFlagsAccessedBetweenInstrs( /// instruction. /// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { assert(CmpInstr.getParent()); assert(MRI); @@ -1416,10 +1456,9 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, return false; UsedNZCV NZCVUsedAfterCmp; - for (auto I = std::next(CmpInstr->getIterator()), - E = CmpInstr->getParent()->instr_end(); - I != E; ++I) { - const MachineInstr &Instr = *I; + for (const MachineInstr &Instr : + instructionsWithoutDebug(std::next(CmpInstr->getIterator()), + CmpInstr->getParent()->instr_end())) { if (Instr.readsRegister(AArch64::NZCV, TRI)) { AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); if (CC == AArch64CC::Invalid) // Unsupported conditional instruction @@ -1684,6 +1723,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { FrameIndex = MI.getOperand(1).getIndex(); @@ -1796,9 +1837,37 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STNPSi: case AArch64::LDG: case AArch64::STGPi: + case AArch64::LD1B_IMM: + case AArch64::LD1H_IMM: + case AArch64::LD1W_IMM: + case AArch64::LD1D_IMM: + case AArch64::ST1B_IMM: + case AArch64::ST1H_IMM: + case AArch64::ST1W_IMM: + case AArch64::ST1D_IMM: + case AArch64::LD1B_H_IMM: + case AArch64::LD1SB_H_IMM: + case AArch64::LD1H_S_IMM: + case AArch64::LD1SH_S_IMM: + case AArch64::LD1W_D_IMM: + case AArch64::LD1SW_D_IMM: + case AArch64::ST1B_H_IMM: + case AArch64::ST1H_S_IMM: + case AArch64::ST1W_D_IMM: + case AArch64::LD1B_S_IMM: + case AArch64::LD1SB_S_IMM: + case AArch64::LD1H_D_IMM: + case AArch64::LD1SH_D_IMM: + case AArch64::ST1B_S_IMM: + case AArch64::ST1H_D_IMM: + case AArch64::LD1B_D_IMM: + case AArch64::LD1SB_D_IMM: + case AArch64::ST1B_D_IMM: return 3; case AArch64::ADDG: case AArch64::STGOffset: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: return 2; } } @@ -1978,20 +2047,25 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { return true; } -bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { if (!LdSt.mayLoadOrStore()) return false; - unsigned Width; - return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); + const MachineOperand *BaseOp; + if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, + Width, TRI)) + 
return false; + BaseOps.push_back(BaseOp); + return true; } bool AArch64InstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, - unsigned &Width, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. if (LdSt.getNumExplicitOperands() == 3) { @@ -2010,7 +2084,7 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth( // Get the scaling factor for the instruction and set the width for the // instruction. - unsigned Scale = 0; + TypeSize Scale(0U, false); int64_t Dummy1, Dummy2; // If this returns false, then it's an instruction we don't want to handle. @@ -2022,12 +2096,13 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth( // set to 1. if (LdSt.getNumExplicitOperands() == 3) { BaseOp = &LdSt.getOperand(1); - Offset = LdSt.getOperand(2).getImm() * Scale; + Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); } else { assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); BaseOp = &LdSt.getOperand(2); - Offset = LdSt.getOperand(3).getImm() * Scale; + Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); } + OffsetIsScalable = Scale.isScalable(); if (!BaseOp->isReg() && !BaseOp->isFI()) return false; @@ -2043,26 +2118,28 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { return OfsOp; } -bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, +bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) { + const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; switch (Opcode) { // Not a memory operation or something we want to handle. 
default: - Scale = Width = 0; + Scale = TypeSize::Fixed(0); + Width = 0; MinOffset = MaxOffset = 0; return false; case AArch64::STRWpost: case AArch64::LDRWpost: Width = 32; - Scale = 4; + Scale = TypeSize::Fixed(4); MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURQi: case AArch64::STURQi: Width = 16; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2072,7 +2149,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURXi: case AArch64::STURDi: Width = 8; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2082,7 +2159,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURWi: case AArch64::STURSi: Width = 4; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2093,7 +2170,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURHi: case AArch64::STURHHi: Width = 2; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2104,7 +2181,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURBi: case AArch64::STURBBi: Width = 1; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2112,14 +2189,15 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDNPQi: case AArch64::STPQi: case AArch64::STNPQi: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 32; MinOffset = -64; MaxOffset = 63; break; case AArch64::LDRQui: case AArch64::STRQui: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = 0; MaxOffset = 4095; break; @@ -2131,7 +2209,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STPDi: case AArch64::STNPXi: case AArch64::STNPDi: - Scale = 8; + Scale = TypeSize::Fixed(8); Width = 16; MinOffset = -64; MaxOffset = 63; @@ -2141,7 +2219,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: - Scale = Width = 8; + Scale = TypeSize::Fixed(8); + Width = 8; MinOffset = 0; MaxOffset = 4095; break; @@ -2153,7 +2232,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STPSi: case AArch64::STNPWi: case AArch64::STNPSi: - Scale = 4; + Scale = TypeSize::Fixed(4); Width = 8; MinOffset = -64; MaxOffset = 63; @@ -2163,7 +2242,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: - Scale = Width = 4; + Scale = TypeSize::Fixed(4); + Width = 4; MinOffset = 0; MaxOffset = 4095; break; @@ -2173,7 +2253,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSHXui: case AArch64::STRHui: case AArch64::STRHHui: - Scale = Width = 2; + Scale = TypeSize::Fixed(2); + Width = 2; MinOffset = 0; MaxOffset = 4095; break; @@ -2183,18 +2264,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSBXui: case AArch64::STRBui: case AArch64::STRBBui: - Scale = Width = 1; + Scale = TypeSize::Fixed(1); + Width = 1; MinOffset = 0; MaxOffset = 4095; break; case AArch64::ADDG: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 0; MinOffset = 0; MaxOffset = 63; break; case AArch64::TAGPstack: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 0; // TAGP with a negative offset turns into SUBP, which has a maximum offset 
// of 63 (not 64!). @@ -2204,31 +2286,110 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDG: case AArch64::STGOffset: case AArch64::STZGOffset: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = -256; MaxOffset = 255; break; + case AArch64::STR_ZZZZXI: + case AArch64::LDR_ZZZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 4; + MinOffset = -256; + MaxOffset = 252; + break; + case AArch64::STR_ZZZXI: + case AArch64::LDR_ZZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 3; + MinOffset = -256; + MaxOffset = 253; + break; + case AArch64::STR_ZZXI: + case AArch64::LDR_ZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 2; + MinOffset = -256; + MaxOffset = 254; + break; case AArch64::LDR_PXI: case AArch64::STR_PXI: - Scale = Width = 2; + Scale = TypeSize::Scalable(2); + Width = SVEMaxBytesPerVector / 8; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDR_ZXI: case AArch64::STR_ZXI: - Scale = Width = 16; + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; MinOffset = -256; MaxOffset = 255; break; + case AArch64::LD1B_IMM: + case AArch64::LD1H_IMM: + case AArch64::LD1W_IMM: + case AArch64::LD1D_IMM: + case AArch64::ST1B_IMM: + case AArch64::ST1H_IMM: + case AArch64::ST1W_IMM: + case AArch64::ST1D_IMM: + // A full vectors worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_H_IMM: + case AArch64::LD1SB_H_IMM: + case AArch64::LD1H_S_IMM: + case AArch64::LD1SH_S_IMM: + case AArch64::LD1W_D_IMM: + case AArch64::LD1SW_D_IMM: + case AArch64::ST1B_H_IMM: + case AArch64::ST1H_S_IMM: + case AArch64::ST1W_D_IMM: + // A half vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(8); + Width = SVEMaxBytesPerVector / 2; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_S_IMM: + case AArch64::LD1SB_S_IMM: + case AArch64::LD1H_D_IMM: + case AArch64::LD1SH_D_IMM: + case AArch64::ST1B_S_IMM: + case AArch64::ST1H_D_IMM: + // A quarter vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(4); + Width = SVEMaxBytesPerVector / 4; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_D_IMM: + case AArch64::LD1SB_D_IMM: + case AArch64::ST1B_D_IMM: + // A eighth vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(2); + Width = SVEMaxBytesPerVector / 8; + MinOffset = -8; + MaxOffset = 7; + break; case AArch64::ST2GOffset: case AArch64::STZ2GOffset: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 32; MinOffset = -256; MaxOffset = 255; break; case AArch64::STGPi: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = -64; MaxOffset = 63; break; @@ -2363,9 +2524,13 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOperandWithOffset returns true. 
-bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps( + ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, + unsigned NumBytes) const { + assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); + const MachineOperand &BaseOp1 = *BaseOps1.front(); + const MachineOperand &BaseOp2 = *BaseOps2.front(); const MachineInstr &FirstLdSt = *BaseOp1.getParent(); const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (BaseOp1.getType() != BaseOp2.getType()) @@ -2379,7 +2544,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, return false; // Only cluster up to a single pair. - if (NumLoads > 1) + if (NumLoads > 2) return false; if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) @@ -2822,11 +2987,11 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, - unsigned SrcReg, bool IsKill, + Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { - unsigned SrcReg0 = SrcReg; - unsigned SrcReg1 = SrcReg; + Register SrcReg0 = SrcReg; + Register SrcReg1 = SrcReg; if (Register::isPhysicalRegister(SrcReg)) { SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); SubIdx0 = 0; @@ -2842,18 +3007,19 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, } void AArch64InstrInfo::storeRegToStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -2862,6 +3028,11 @@ void AArch64InstrInfo::storeRegToStackSlot( case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::STRHui; + else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_PXI; + StackID = TargetStackID::SVEVector; + } break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { @@ -2901,6 +3072,10 @@ void AArch64InstrInfo::storeRegToStackSlot( get(AArch64::STPXi), SrcReg, isKill, AArch64::sube64, AArch64::subo64, FI, MMO); return; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZXI; + StackID = TargetStackID::SVEVector; } break; case 24: @@ -2919,6 +3094,10 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = 
AArch64::STR_ZZXI; + StackID = TargetStackID::SVEVector; } break; case 48: @@ -2926,6 +3105,10 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev2d; Offset = false; + } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZZZXI; + StackID = TargetStackID::SVEVector; } break; case 64: @@ -2933,19 +3116,13 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZZZZXI; + StackID = TargetStackID::SVEVector; } break; } - unsigned StackID = TargetStackID::Default; - if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); - Opc = AArch64::STR_PXI; - StackID = TargetStackID::SVEVector; - } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); - Opc = AArch64::STR_ZXI; - StackID = TargetStackID::SVEVector; - } assert(Opc && "Unknown register class"); MFI.setStackID(FI, StackID); @@ -2962,11 +3139,11 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, - unsigned DestReg, unsigned SubIdx0, + Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { - unsigned DestReg0 = DestReg; - unsigned DestReg1 = DestReg; + Register DestReg0 = DestReg; + Register DestReg1 = DestReg; bool IsUndef = true; if (Register::isPhysicalRegister(DestReg)) { DestReg0 = TRI.getSubReg(DestReg, SubIdx0); @@ -2984,18 +3161,19 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, } void AArch64InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -3004,6 +3182,11 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRHui; + else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_PXI; + StackID = TargetStackID::SVEVector; + } break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { @@ -3043,6 +3226,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( get(AArch64::LDPXi), DestReg, AArch64::sube64, AArch64::subo64, FI, MMO); return; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && 
"Unexpected register load without SVE"); + Opc = AArch64::LDR_ZXI; + StackID = TargetStackID::SVEVector; } break; case 24: @@ -3061,6 +3248,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZXI; + StackID = TargetStackID::SVEVector; } break; case 48: @@ -3068,6 +3259,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev2d; Offset = false; + } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZZXI; + StackID = TargetStackID::SVEVector; } break; case 64: @@ -3075,20 +3270,14 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZZZXI; + StackID = TargetStackID::SVEVector; } break; } - unsigned StackID = TargetStackID::Default; - if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); - Opc = AArch64::LDR_PXI; - StackID = TargetStackID::SVEVector; - } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); - Opc = AArch64::LDR_ZXI; - StackID = TargetStackID::SVEVector; - } assert(Opc && "Unknown register class"); MFI.setStackID(FI, StackID); @@ -3100,6 +3289,17 @@ void AArch64InstrInfo::loadRegFromStackSlot( MI.addMemOperand(MMO); } +bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, + const MachineInstr &UseMI, + const TargetRegisterInfo *TRI) { + return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), + UseMI.getIterator()), + [TRI](const MachineInstr &I) { + return I.modifiesRegister(AArch64::NZCV, TRI) || + I.readsRegister(AArch64::NZCV, TRI); + }); +} + // Helper function to emit a frame offset adjustment from a given // pointer (SrcReg), stored into DestReg. This function is explicit // in that it requires the opcode. 
@@ -3146,6 +3346,10 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + Register TmpReg = DestReg; + if (TmpReg == AArch64::XZR) + TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); do { uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); unsigned LocalShiftSize = 0; @@ -3155,7 +3359,11 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, } assert((ThisVal >> ShiftSize) <= MaxEncoding && "Encoding cannot handle value that big"); - auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + + Offset -= ThisVal << LocalShiftSize; + if (Offset == 0) + TmpReg = DestReg; + auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) .addReg(SrcReg) .addImm(Sign * (int)ThisVal); if (ShiftSize) @@ -3176,8 +3384,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) .addImm(Imm) .setMIFlag(Flag); - assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " - "emit a single SEH directive"); + assert(Offset == 0 && "Expected remaining offset to be zero to " + "emit a single SEH directive"); } else if (DestReg == AArch64::SP) { if (HasWinCFI) *HasWinCFI = true; @@ -3190,8 +3398,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, *HasWinCFI = true; } - SrcReg = DestReg; - Offset -= ThisVal << LocalShiftSize; + SrcReg = TmpReg; } while (Offset); } @@ -3414,18 +3621,6 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( return nullptr; } -static bool isSVEScaledImmInstruction(unsigned Opcode) { - switch (Opcode) { - case AArch64::LDR_ZXI: - case AArch64::STR_ZXI: - case AArch64::LDR_PXI: - case AArch64::STR_PXI: - return true; - default: - return false; - } -} - int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &SOffset, bool *OutUseUnscaledOp, @@ -3458,20 +3653,23 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: + case AArch64::STGloop: + case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } // Get the min/max offset and the scale. - unsigned Scale, Width; + TypeSize ScaleValue(0U, false); + unsigned Width; int64_t MinOff, MaxOff; - if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, + if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); // Construct the complete offset. - bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); - int64_t Offset = - IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); + bool IsMulVL = ScaleValue.isScalable(); + unsigned Scale = ScaleValue.getKnownMinSize(); + int64_t Offset = IsMulVL ? 
SOffset.getScalableBytes() : SOffset.getBytes(); const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); @@ -3484,9 +3682,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); if (useUnscaledOp && - !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) + !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, + MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); + Scale = ScaleValue.getKnownMinSize(); + assert(IsMulVL == ScaleValue.isScalable() && + "Unscaled opcode has different value for scalable"); + int64_t Remainder = Offset % Scale; assert(!(Remainder && useUnscaledOp) && "Cannot have remainder when using unscaled op"); @@ -5791,6 +5994,35 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); }); + // We check to see if CFI Instructions are present, and if they are + // we find the number of CFI Instructions in the candidates. + unsigned CFICount = 0; + MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); + for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); + Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { + const std::vector<MCCFIInstruction> &CFIInstructions = + RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); + if (MBBI->isCFIInstruction()) { + unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); + MCCFIInstruction CFI = CFIInstructions[CFIIndex]; + CFICount++; + } + MBBI++; + } + + // We compare the number of found CFI Instructions to the number of CFI + // instructions in the parent function for each candidate. We must check this + // since if we outline one of the CFI instructions in a function, we have to + // outline them all for correctness. If we do not, the address offsets will be + // incorrect between the two sections of the program. + for (outliner::Candidate &C : RepeatedSequenceLocs) { + std::vector<MCCFIInstruction> CFIInstructions = + C.getMF()->getFrameInstructions(); + + if (CFICount > 0 && CFICount != CFIInstructions.size()) + return outliner::OutlinedFunction(); + } + // Returns true if an instructions is safe to fix up, false otherwise. auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { if (MI.isCall()) @@ -5811,23 +6043,29 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( if (MI.mayLoadOrStore()) { const MachineOperand *Base; // Filled with the base operand of MI. int64_t Offset; // Filled with the offset of MI. + bool OffsetIsScalable; // Does it allow us to offset the base operand and is the base the // register SP? - if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() || - Base->getReg() != AArch64::SP) + if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || + !Base->isReg() || Base->getReg() != AArch64::SP) + return false; + + // Fixe-up code below assumes bytes. + if (OffsetIsScalable) return false; // Find the minimum/maximum offset for this instruction and check // if fixing it up would be in range. int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction. - unsigned Scale; // The scale to multiply the offsets by. + TypeSize Scale(0U, false); // The scale to multiply the offsets by. 
unsigned DummyWidth; getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); Offset += 16; // Update the offset to what it would be if we outlined. - if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) + if (Offset < MinOffset * (int64_t)Scale.getFixedSize() || + Offset > MaxOffset * (int64_t)Scale.getFixedSize()) return false; // It's in range, so we can outline it. @@ -5854,7 +6092,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( } else if (LastInstrOpcode == AArch64::BL || - (LastInstrOpcode == AArch64::BLR && !HasBTI)) { + ((LastInstrOpcode == AArch64::BLR || + LastInstrOpcode == AArch64::BLRNoIP) && + !HasBTI)) { // FIXME: Do we need to check if the code after this uses the value of LR? FrameID = MachineOutlinerThunk; NumBytesToCreateFrame = 0; @@ -5960,6 +6200,11 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( } } + // If we have CFI instructions, we can only outline if the outlined section + // can be a tail call + if (FrameID != MachineOutlinerTailCall && CFICount > 0) + return outliner::OutlinedFunction(); + return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); } @@ -5986,6 +6231,10 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( if (!AFI || AFI->hasRedZone().getValueOr(true)) return false; + // FIXME: Teach the outliner to generate/handle Windows unwind info. + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + // It's safe to outline from MF. return true; } @@ -6081,6 +6330,15 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; + // We can only outline these if we will tail call the outlined function, or + // fix up the CFI offsets. Currently, CFI instructions are outlined only if + // in a tail call. + // + // FIXME: If the proper fixups for the offset are implemented, this should be + // possible. + if (MI.isCFIInstruction()) + return outliner::InstrType::Legal; + // Don't allow debug values to impact outlining type. if (MI.isDebugInstr() || MI.isIndirectDebugValue()) return outliner::InstrType::Invisible; @@ -6150,10 +6408,11 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // If we don't know anything about the callee, assume it depends on the // stack layout of the caller. In that case, it's only legal to outline - // as a tail-call. Whitelist the call instructions we know about so we + // as a tail-call. Explicitly list the call instructions we know about so we // don't get unexpected results with call pseudo-instructions. auto UnknownCallOutlineType = outliner::InstrType::Illegal; - if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL) + if (MI.getOpcode() == AArch64::BLR || + MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) UnknownCallOutlineType = outliner::InstrType::LegalTerminator; if (!Callee) @@ -6205,26 +6464,29 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { const MachineOperand *Base; unsigned Width; int64_t Offset; + bool OffsetIsScalable; // Is this a load or store with an immediate offset with SP as the base? if (!MI.mayLoadOrStore() || - !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) || + !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, + &RI) || (Base->isReg() && Base->getReg() != AArch64::SP)) continue; // It is, so we have to fix it up. 
- unsigned Scale; + TypeSize Scale(0U, false); int64_t Dummy1, Dummy2; MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); assert(Scale != 0 && "Unexpected opcode!"); + assert(!OffsetIsScalable && "Expected offset to be a byte offset"); // We've pushed the return address to the stack, so add 16 to the offset. // This is safe, since we already checked if it would overflow when we // checked if this instruction was legal to outline. - int64_t NewImm = (Offset + 16) / Scale; + int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize(); StackOffsetOperand.setImm(NewImm); } } @@ -6285,15 +6547,21 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { - // For thunk outlining, rewrite the last instruction from a call to a - // tail-call. - if (OF.FrameConstructionID == MachineOutlinerThunk) { + + AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); + + if (OF.FrameConstructionID == MachineOutlinerTailCall) + FI->setOutliningStyle("Tail Call"); + else if (OF.FrameConstructionID == MachineOutlinerThunk) { + // For thunk outlining, rewrite the last instruction from a call to a + // tail-call. MachineInstr *Call = &*--MBB.instr_end(); unsigned TailOpcode; if (Call->getOpcode() == AArch64::BL) { TailOpcode = AArch64::TCRETURNdi; } else { - assert(Call->getOpcode() == AArch64::BLR); + assert(Call->getOpcode() == AArch64::BLR || + Call->getOpcode() == AArch64::BLRNoIP); TailOpcode = AArch64::TCRETURNriALL; } MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) @@ -6301,6 +6569,8 @@ void AArch64InstrInfo::buildOutlinedFrame( .addImm(0); MBB.insert(MBB.end(), TC); Call->eraseFromParent(); + + FI->setOutliningStyle("Thunk"); } bool IsLeafFunction = true; @@ -6320,7 +6590,8 @@ void AArch64InstrInfo::buildOutlinedFrame( IsLeafFunction = false; // LR has to be a live in so that we can save it. - MBB.addLiveIn(AArch64::LR); + if (!MBB.isLiveIn(AArch64::LR)) + MBB.addLiveIn(AArch64::LR); MachineBasicBlock::iterator It = MBB.begin(); MachineBasicBlock::iterator Et = MBB.end(); @@ -6343,7 +6614,7 @@ void AArch64InstrInfo::buildOutlinedFrame( // Add a CFI saying the stack was moved 16 B down. int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16)); + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(StackPosEntry) .setMIFlags(MachineInstr::FrameSetup); @@ -6351,7 +6622,7 @@ void AArch64InstrInfo::buildOutlinedFrame( // Add a CFI saying that the LR that we want to find is now 16 B higher than // before. int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16)); + MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(LRPosEntry) .setMIFlags(MachineInstr::FrameSetup); @@ -6399,13 +6670,20 @@ void AArch64InstrInfo::buildOutlinedFrame( } // It's not a tail call, so we have to insert the return ourselves. + + // LR has to be a live in so that we can return to it. 
+ if (!MBB.isLiveIn(AArch64::LR)) + MBB.addLiveIn(AArch64::LR); + MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) - .addReg(AArch64::LR, RegState::Undef); + .addReg(AArch64::LR); MBB.insert(MBB.end(), ret); signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, ShouldSignReturnAddrWithAKey); + FI->setOutliningStyle("Function"); + // Did we have to modify the stack by saving the link register? if (OF.FrameConstructionID != MachineOutlinerDefault) return; @@ -6519,7 +6797,8 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, // TODO: Handle cases where Reg is a super- or sub-register of the // destination register. - if (Reg != MI.getOperand(0).getReg()) + const MachineOperand &Op0 = MI.getOperand(0); + if (!Op0.isReg() || Reg != Op0.getReg()) return None; switch (MI.getOpcode()) { @@ -6614,5 +6893,17 @@ AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, return TargetInstrInfo::describeLoadedValue(MI, Reg); } +uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { + return get(Opc).TSFlags & AArch64::ElementSizeMask; +} + +unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { + if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) + return AArch64::BLRNoIP; + else + return AArch64::BLR; +} + #define GET_INSTRINFO_HELPERS +#define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 66e517e549035..298c04d81708d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/TypeSize.h" #define GET_INSTRINFO_HEADER #include "AArch64GenInstrInfo.inc" @@ -51,8 +52,8 @@ public: bool isAsCheapAsAMove(const MachineInstr &MI) const override; - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, + Register &DstReg, unsigned &SubIdx) const override; bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, @@ -112,14 +113,19 @@ public: /// Hint that pairing the given load or store is unprofitable. static void suppressLdStPair(MachineInstr &MI); - bool getMemOperandWithOffset(const MachineInstr &MI, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( + const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const override; + /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`. + /// This is true for some SVE instructions like ldr/str that have a + /// 'reg + imm' addressing mode where the immediate is an index to the + /// scalable vector located at 'reg + imm * vscale x #bytes'. bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, - int64_t &Offset, unsigned &Width, + int64_t &Offset, bool &OffsetIsScalable, + unsigned &Width, const TargetRegisterInfo *TRI) const; /// Return the immediate offset of the base register in a load/store \p LdSt. @@ -129,12 +135,12 @@ public: /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly. /// /// For unscaled instructions, \p Scale is set to 1. 
- static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, + static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset); - bool shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const override; + bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, + unsigned NumLoads, unsigned NumBytes) const override; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, @@ -149,13 +155,13 @@ public: bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, + MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, + MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -191,11 +197,12 @@ public: bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, - unsigned, unsigned, int &, int &, int &) const override; + Register, Register, Register, int &, int &, + int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const DebugLoc &DL, unsigned DstReg, - ArrayRef<MachineOperand> Cond, unsigned TrueReg, - unsigned FalseReg) const override; + const DebugLoc &DL, Register DstReg, + ArrayRef<MachineOperand> Cond, Register TrueReg, + Register FalseReg) const override; void getNoop(MCInst &NopInst) const override; bool isSchedulingBoundary(const MachineInstr &MI, @@ -205,13 +212,13 @@ public: /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, + bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction supplying the argument to /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, + bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, + Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool optimizeCondBranch(MachineInstr &MI) const override; @@ -264,6 +271,8 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + /// Returns the vector element size (B, H, S or D) of an SVE opcode. + uint64_t getElementSizeForOpcode(unsigned Opc) const; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. 
static bool isFalkorShiftExtFast(const MachineInstr &MI); @@ -288,6 +297,8 @@ protected: isCopyInstrImpl(const MachineInstr &MI) const override; private: + unsigned getInstBundleLength(const MachineInstr &MI) const; + /// Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. /// @@ -305,6 +316,12 @@ private: unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; }; +/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI +/// which either reads or clobbers NZCV. +bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, + const MachineInstr &UseMI, + const TargetRegisterInfo *TRI); + /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg /// plus Offset. This is intended to be used from within the prolog/epilog /// insertion (PEI) pass, where a virtual scratch register may be allocated @@ -369,12 +386,24 @@ static inline bool isCondBranchOpcode(int Opc) { } static inline bool isIndirectBranchOpcode(int Opc) { - return Opc == AArch64::BR; + switch (Opc) { + case AArch64::BR: + case AArch64::BRAA: + case AArch64::BRAB: + case AArch64::BRAAZ: + case AArch64::BRABZ: + return true; + } + return false; } +/// Return opcode to be used for indirect calls. +unsigned getBLRCallOpcode(const MachineFunction &MF); + // struct TSFlags { #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits -#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 1-bit +#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit +#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits // } namespace AArch64 { @@ -389,13 +418,31 @@ enum ElementSizeType { }; enum DestructiveInstType { - DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), - NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0), - Destructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), + DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0xf), + NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0), + DestructiveOther = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), + DestructiveUnary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x2), + DestructiveBinaryImm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x3), + DestructiveBinaryShImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x4), + DestructiveBinary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x5), + DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6), + DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7), + DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8), +}; + +enum FalseLaneType { + FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3), + FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), + FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2), }; #undef TSFLAG_ELEMENT_SIZE_TYPE #undef TSFLAG_DESTRUCTIVE_INST_TYPE +#undef TSFLAG_FALSE_LANE_TYPE + +int getSVEPseudoMap(uint16_t Opcode); +int getSVERevInstr(uint16_t Opcode); +int getSVENonRevInstr(uint16_t Opcode); } } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d590d4d913ff8..f4a5f639e4973 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -14,142 +14,154 @@ // ARM Instruction Predicate Definitions. 
// def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; + AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; + AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; + AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; + AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; + AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; +def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, + AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; def HasVH : Predicate<"Subtarget->hasVH()">, - AssemblerPredicate<"FeatureVH", "vh">; + AssemblerPredicate<(all_of FeatureVH), "vh">; def HasLOR : Predicate<"Subtarget->hasLOR()">, - AssemblerPredicate<"FeatureLOR", "lor">; + AssemblerPredicate<(all_of FeatureLOR), "lor">; def HasPA : Predicate<"Subtarget->hasPA()">, - AssemblerPredicate<"FeaturePA", "pa">; + AssemblerPredicate<(all_of FeaturePA), "pa">; def HasJS : Predicate<"Subtarget->hasJS()">, - AssemblerPredicate<"FeatureJS", "jsconv">; + AssemblerPredicate<(all_of FeatureJS), "jsconv">; def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, - AssemblerPredicate<"FeatureCCIDX", "ccidx">; + AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">; def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, - AssemblerPredicate<"FeatureComplxNum", "complxnum">; + AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">; def HasNV : Predicate<"Subtarget->hasNV()">, - AssemblerPredicate<"FeatureNV", "nv">; + AssemblerPredicate<(all_of FeatureNV), "nv">; def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">, - AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">; + AssemblerPredicate<(all_of FeatureRASv8_4), "rasv8_4">; def HasMPAM : Predicate<"Subtarget->hasMPAM()">, - AssemblerPredicate<"FeatureMPAM", "mpam">; + AssemblerPredicate<(all_of FeatureMPAM), "mpam">; def HasDIT : Predicate<"Subtarget->hasDIT()">, - AssemblerPredicate<"FeatureDIT", "dit">; + AssemblerPredicate<(all_of FeatureDIT), "dit">; def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, - AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">; + AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">; def HasAM : Predicate<"Subtarget->hasAM()">, - AssemblerPredicate<"FeatureAM", "am">; + AssemblerPredicate<(all_of FeatureAM), "am">; def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, - AssemblerPredicate<"FeatureSEL2", "sel2">; + AssemblerPredicate<(all_of FeatureSEL2), "sel2">; def HasPMU : Predicate<"Subtarget->hasPMU()">, - AssemblerPredicate<"FeaturePMU", "pmu">; + AssemblerPredicate<(all_of FeaturePMU), "pmu">; def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, - AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">; + AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; def HasFMI : Predicate<"Subtarget->hasFMI()">, - AssemblerPredicate<"FeatureFMI", "fmi">; + AssemblerPredicate<(all_of FeatureFMI), "fmi">; def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, - AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">; + AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; + 
AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; + AssemblerPredicate<(all_of FeatureNEON), "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; + AssemblerPredicate<(all_of FeatureCrypto), "crypto">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, - AssemblerPredicate<"FeatureSM4", "sm4">; + AssemblerPredicate<(all_of FeatureSM4), "sm4">; def HasSHA3 : Predicate<"Subtarget->hasSHA3()">, - AssemblerPredicate<"FeatureSHA3", "sha3">; + AssemblerPredicate<(all_of FeatureSHA3), "sha3">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<"FeatureSHA2", "sha2">; + AssemblerPredicate<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<"FeatureAES", "aes">; + AssemblerPredicate<(all_of FeatureAES), "aes">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<"FeatureDotProd", "dotprod">; + AssemblerPredicate<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; + AssemblerPredicate<(all_of FeatureCRC), "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, - AssemblerPredicate<"FeatureLSE", "lse">; + AssemblerPredicate<(all_of FeatureLSE), "lse">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<"FeatureRAS", "ras">; + AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasRDM : Predicate<"Subtarget->hasRDM()">, - AssemblerPredicate<"FeatureRDM", "rdm">; + AssemblerPredicate<(all_of FeatureRDM), "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<"FeatureFullFP16", "fullfp16">; + AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - AssemblerPredicate<"FeatureFP16FML", "fp16fml">; + AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, - AssemblerPredicate<"FeatureSPE", "spe">; + AssemblerPredicate<(all_of FeatureSPE), "spe">; def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, - AssemblerPredicate<"FeatureFuseAES", + AssemblerPredicate<(all_of FeatureFuseAES), "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, - AssemblerPredicate<"FeatureSVE", "sve">; + AssemblerPredicate<(all_of FeatureSVE), "sve">; def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, - AssemblerPredicate<"FeatureSVE2", "sve2">; + AssemblerPredicate<(all_of FeatureSVE2), "sve2">; def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, - AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">; + AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">; def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, - AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">; + AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, - AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">; + AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">; + AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, - AssemblerPredicate<"FeatureRCPC", "rcpc">; + AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, - 
AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">; + AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">; def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, - AssemblerPredicate<"FeatureFRInt3264", "frint3264">; + AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">; def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<"FeatureSB", "sb">; + AssemblerPredicate<(all_of FeatureSB), "sb">; def HasPredRes : Predicate<"Subtarget->hasPredRes()">, - AssemblerPredicate<"FeaturePredRes", "predres">; + AssemblerPredicate<(all_of FeaturePredRes), "predres">; def HasCCDP : Predicate<"Subtarget->hasCCDP()">, - AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">; + AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">; def HasBTI : Predicate<"Subtarget->hasBTI()">, - AssemblerPredicate<"FeatureBranchTargetId", "bti">; + AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, - AssemblerPredicate<"FeatureMTE", "mte">; + AssemblerPredicate<(all_of FeatureMTE), "mte">; def HasTME : Predicate<"Subtarget->hasTME()">, - AssemblerPredicate<"FeatureTME", "tme">; + AssemblerPredicate<(all_of FeatureTME), "tme">; def HasETE : Predicate<"Subtarget->hasETE()">, - AssemblerPredicate<"FeatureETE", "ete">; + AssemblerPredicate<(all_of FeatureETE), "ete">; def HasTRBE : Predicate<"Subtarget->hasTRBE()">, - AssemblerPredicate<"FeatureTRBE", "trbe">; + AssemblerPredicate<(all_of FeatureTRBE), "trbe">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">, + AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; +def UseExperimentalZeroingPseudos + : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; def UseNegativeImmediates - : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), "NegativeImmediates">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", @@ -227,6 +239,10 @@ def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; @@ -245,6 +261,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; // Generates the general dynamic sequences, i.e. 
// adrp x0, :tlsdesc:var @@ -419,7 +436,14 @@ def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; -def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; +def AArch64strict_fcmp : SDNode<"AArch64ISD::STRICT_FCMP", SDT_AArch64FCmp, + [SDNPHasChain]>; +def AArch64strict_fcmpe : SDNode<"AArch64ISD::STRICT_FCMPE", SDT_AArch64FCmp, + [SDNPHasChain]>; +def AArch64any_fcmp : PatFrags<(ops node:$lhs, node:$rhs), + [(AArch64strict_fcmp node:$lhs, node:$rhs), + (AArch64fcmp node:$lhs, node:$rhs)]>; def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; @@ -457,10 +481,12 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; +def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>; +def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -528,6 +554,9 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; +def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; +def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; + def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -544,6 +573,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; @@ -564,6 +594,8 @@ let RecomputePerFunction = 1 in { def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; + def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>; + def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>; // Toggles patterns which aren't beneficial in GlobalISel when we aren't // optimizing. This allows us to selectively use patterns without impacting // SelectionDAG's behaviour. 
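Stepping back before the next AArch64InstrInfo.td hunk: the getMemOpInfo and getMemOperandWithOffsetWidth changes earlier in this diff replace the plain unsigned scale with a TypeSize, so an immediate index is multiplied by the known minimum size and, for the SVE forms (LDR_ZXI/STR_ZXI, the LD1/ST1 _IMM variants, and so on), additionally by the runtime vscale. The following standalone sketch is illustrative only; the struct and function names are invented here and are not LLVM API. It shows how a scaled immediate maps to a byte offset under fixed versus scalable scaling:

#include <cstdint>
#include <cstdio>

// Illustrative sketch. KnownMinScale plays the role of
// TypeSize::getKnownMinSize(); IsScalable of TypeSize::isScalable().
struct MemOpScale {
  uint64_t KnownMinScale; // bytes per immediate step, per vscale unit
  bool IsScalable;        // true for SVE fill/spill and LD1/ST1 immediate forms
};

// Byte offset addressed by "reg + Imm" once the vector length (vscale) is known.
static int64_t byteOffset(MemOpScale S, int64_t Imm, uint64_t VScale) {
  return Imm * (int64_t)S.KnownMinScale * (int64_t)(S.IsScalable ? VScale : 1);
}

int main() {
  MemOpScale LDRXui  = {8, false};  // fixed scale, immediate range [0, 4095]
  MemOpScale LDR_ZXI = {16, true};  // SVE vector fill/spill, range [-256, 255]
  MemOpScale LD1B    = {16, true};  // LD1B_IMM: one full vector per step, range [-8, 7]

  // With 256-bit SVE vectors, vscale is 2, so one LD1B immediate step is 32 bytes.
  std::printf("%lld %lld %lld\n",
              (long long)byteOffset(LDRXui, 3, 2),    // 24
              (long long)byteOffset(LDR_ZXI, -1, 2),  // -32
              (long long)byteOffset(LD1B, 7, 2));     // 224
  return 0;
}

This is also why areMemAccessesTriviallyDisjoint above only reasons about two accesses when their OffsetIsScalable flags match: a fixed byte offset and a vscale-scaled offset are not directly comparable.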
@@ -686,6 +718,14 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in { : Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>; } +// SpeculationBarrierEndBB must only be used after an unconditional control +// flow, i.e. after a terminator for which isBarrier is True. +let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + def SpeculationBarrierISBDSBEndBB + : Pseudo<(outs), (ins), []>, Sched<[]>; + def SpeculationBarrierSBEndBB + : Pseudo<(outs), (ins), []>, Sched<[]>; +} //===----------------------------------------------------------------------===// // System instructions. @@ -698,8 +738,15 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"dgh", (HINT 0b110)>; def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; def : InstAlias<"csdb", (HINT 20)>; +// In order to be able to write readable assembly, LLVM should accept assembly +// inputs that use Branch Target Identification mnemonics, even with BTI disabled. +// However, in order to be compatible with other assemblers (e.g. GAS), LLVM +// should not emit these mnemonics unless BTI is enabled. +def : InstAlias<"bti", (HINT 32), 0>; +def : InstAlias<"bti $op", (HINT btihint_op:$op), 0>; def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>; def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>; @@ -731,10 +778,58 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> { // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>; +} + +// ARMv8.6-A BFloat +let Predicates = [HasBF16] in { +defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">; +defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">; +def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">; +def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFCVTN : SIMD_BFCVTN; +def BFCVTN2 : SIMD_BFCVTN2; +def BFCVT : BF16ToSinglePrecision<"bfcvt">; +} + +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element.
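// (Illustrative sketch, not part of this patch.) Only a usdot intrinsic
// exists, so the indexed SUDOT defined below is selected from
// int_aarch64_neon_usdot with its data operands swapped: the chosen 32-bit
// lane of $Rm is splatted with AArch64duplane32 and becomes the unsigned
// operand, while $Rn supplies the signed operand. The matched DAG is roughly
//
//   (int_aarch64_neon_usdot $Rd,
//        (bitconvert (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx)),
//        $Rn)
//
// which is then emitted as, for example, "sudot v0.4s, v1.16b, v2.4b[1]".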
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind, + string rhs_kind, RegisterOperand RegType, + ValueType AccumType, ValueType InputType> + : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind, + lhs_kind, rhs_kind, RegType, AccumType, + InputType, null_frag> { + let Pattern = [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (InputType RegType:$Rn))))]; +} + +multiclass SIMDSUDOTIndex { + def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>; + def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>; +} + +defm SUDOTlane : SIMDSUDOTIndex; + } // ARMv8.2-A FP16 Fused Multiply-Add Long @@ -819,38 +914,56 @@ let Predicates = [HasComplxNum, HasNEON] in { // important for compatibility with other assemblers (e.g. GAS) when building // software compatible with both CPUs that do or don't implement PA. let Uses = [LR], Defs = [LR] in { - def PACIAZ : SystemNoOperands<0b000, "hint #24">; - def PACIBZ : SystemNoOperands<0b010, "hint #26">; + def PACIAZ : SystemNoOperands<0b000, "hint\t#24">; + def PACIBZ : SystemNoOperands<0b010, "hint\t#26">; let isAuthenticated = 1 in { - def AUTIAZ : SystemNoOperands<0b100, "hint #28">; - def AUTIBZ : SystemNoOperands<0b110, "hint #30">; + def AUTIAZ : SystemNoOperands<0b100, "hint\t#28">; + def AUTIBZ : SystemNoOperands<0b110, "hint\t#30">; } } let Uses = [LR, SP], Defs = [LR] in { - def PACIASP : SystemNoOperands<0b001, "hint #25">; - def PACIBSP : SystemNoOperands<0b011, "hint #27">; + def PACIASP : SystemNoOperands<0b001, "hint\t#25">; + def PACIBSP : SystemNoOperands<0b011, "hint\t#27">; let isAuthenticated = 1 in { - def AUTIASP : SystemNoOperands<0b101, "hint #29">; - def AUTIBSP : SystemNoOperands<0b111, "hint #31">; + def AUTIASP : SystemNoOperands<0b101, "hint\t#29">; + def AUTIBSP : SystemNoOperands<0b111, "hint\t#31">; } } let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { - def PACIA1716 : SystemNoOperands<0b000, "hint #8">; - def PACIB1716 : SystemNoOperands<0b010, "hint #10">; + def PACIA1716 : SystemNoOperands<0b000, "hint\t#8">; + def PACIB1716 : SystemNoOperands<0b010, "hint\t#10">; let isAuthenticated = 1 in { - def AUTIA1716 : SystemNoOperands<0b100, "hint #12">; - def AUTIB1716 : SystemNoOperands<0b110, "hint #14">; + def AUTIA1716 : SystemNoOperands<0b100, "hint\t#12">; + def AUTIB1716 : SystemNoOperands<0b110, "hint\t#14">; } } let Uses = [LR], Defs = [LR], CRm = 0b0000 in { - def XPACLRI : SystemNoOperands<0b111, "hint #7">; -} + def XPACLRI : SystemNoOperands<0b111, "hint\t#7">; +} + +// In order to be able to write readable assembly, LLVM should accept assembly +// inputs that use pointer authentication mnemonics, even with PA disabled. +// However, in order to be compatible with other assemblers (e.g. GAS), LLVM +// should not emit these mnemonics unless PA is enabled. 
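// (Illustrative note, not part of this patch.) The trailing 0/1 on the
// aliases below is the InstAlias emit-priority operand: with 0 the alias is
// only accepted when parsing assembly, with 1 it is also preferred when
// printing. That is the mechanism behind "always parse, but only print when
// the feature is enabled", e.g. for PACIAZ:
//
//   def : InstAlias<"paciaz", (PACIAZ), 0>;    // accepted unconditionally
//   let Predicates = [HasPA] in
//   def : InstAlias<"paciaz", (PACIAZ), 1>;    // preferred for printing with PA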
+def : InstAlias<"paciaz", (PACIAZ), 0>; +def : InstAlias<"pacibz", (PACIBZ), 0>; +def : InstAlias<"autiaz", (AUTIAZ), 0>; +def : InstAlias<"autibz", (AUTIBZ), 0>; +def : InstAlias<"paciasp", (PACIASP), 0>; +def : InstAlias<"pacibsp", (PACIBSP), 0>; +def : InstAlias<"autiasp", (AUTIASP), 0>; +def : InstAlias<"autibsp", (AUTIBSP), 0>; +def : InstAlias<"pacia1716", (PACIA1716), 0>; +def : InstAlias<"pacib1716", (PACIB1716), 0>; +def : InstAlias<"autia1716", (AUTIA1716), 0>; +def : InstAlias<"autib1716", (AUTIB1716), 0>; +def : InstAlias<"xpaclri", (XPACLRI), 0>; // These pointer authentication instructions require armv8.3a let Predicates = [HasPA] in { - // When compiling with PA, there is a better mnemonic for these instructions. + // When PA is enabled, a better mnemonic should be emitted. def : InstAlias<"paciaz", (PACIAZ), 1>; def : InstAlias<"pacibz", (PACIBZ), 1>; def : InstAlias<"autiaz", (AUTIAZ), 1>; @@ -884,15 +997,23 @@ let Predicates = [HasPA] in { def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; // Combined Instructions - def BRAA : AuthBranchTwoOperands<0, 0, "braa">; - def BRAB : AuthBranchTwoOperands<0, 1, "brab">; - def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; - def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAA : AuthBranchTwoOperands<0, 0, "braa">; + def BRAB : AuthBranchTwoOperands<0, 1, "brab">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; + def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + } - def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; - def BRABZ : AuthOneOperand<0b000, 1, "brabz">; - def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; - def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; + def BRABZ : AuthOneOperand<0b000, 1, "brabz">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; + def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + } let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def RETAA : AuthReturn<0b010, 0, "retaa">; @@ -1538,17 +1659,29 @@ def TAGPstack // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. +// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. $Rm is the loop counter. let isCodeGenOnly=1, mayStore=1 in { +def STGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +// A variant of the above where $Rn2 is an independent register not tied to the input register $Rn. +// Their purpose is to use a FrameIndex operand as $Rn (which of course can not be written back). 
def STGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; } @@ -1894,9 +2027,19 @@ def ERET : SpecialReturn<0b0100, "eret">; def : InstAlias<"ret", (RET LR)>; let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; + def BLR : BranchReg<0b0001, "blr", []>; + def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>, + Sched<[WriteBrReg]>, + PseudoInstExpansion<(BLR GPR64:$Rn)>; } // isCall +def : Pat<(AArch64call GPR64:$Rn), + (BLR GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call GPR64noip:$Rn), + (BLRNoIP GPR64noip:$Rn)>, + Requires<[SLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -2129,6 +2272,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>; defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>; defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v4bf16, LDRDroW, LDRDroX>; } defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>; @@ -2143,6 +2287,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v8bf16, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>; } } // AddedComplexity = 10 @@ -2225,6 +2370,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr", [(set (f128 FPR128Op:$Rt), (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; +// bf16 load pattern +def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. 
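// (Illustrative example, not part of this patch.) With the bf16 load pattern
// above, a scalar bfloat load at a small immediate offset selects the
// ordinary 16-bit FP load; for instance
//
//   %v = load bfloat, bfloat* %p
//
// becomes "ldr h0, [x0]", and the uimm12s2 form likewise covers offsets such
// as [x0, #2] or [x0, #4].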
@@ -2274,6 +2423,8 @@ let Predicates = [IsLE] in { (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; @@ -2297,6 +2448,8 @@ let Predicates = [IsLE] in { (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; @@ -2381,11 +2534,11 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) { const DataLayout &DL = MF->getDataLayout(); - MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL); - return Align && *Align >= 4 && G->getOffset() % 4 == 0; + Align Align = G->getGlobal()->getPointerAlignment(DL); + return Align >= 4 && G->getOffset() % 4 == 0; } if (auto *C = dyn_cast<ConstantPoolSDNode>(N)) - return C->getAlignment() >= 4 && C->getOffset() % 4 == 0; + return C->getAlign() >= 4 && C->getOffset() % 4 == 0; return false; }]>; @@ -2425,7 +2578,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur", [(set FPR8Op:$Rt, (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur", - [(set FPR16Op:$Rt, + [(set (f16 FPR16Op:$Rt), (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur", [(set (f32 FPR32Op:$Rt), @@ -2722,6 +2875,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">; def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>; +def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)), + (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>; + + //--- // (Register offset) @@ -2791,6 +2948,7 @@ let Predicates = [IsLE] in { defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v4bf16, FPR64, STRDroW, STRDroX>; } defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>; @@ -2806,6 +2964,7 @@ let Predicates = [IsLE, UseSTRQro] in { defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v8bf16, FPR128, STRQroW, STRQroX>; } } // AddedComplexity = 10 @@ -2866,6 +3025,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb", (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +// bf16 store pattern +def : Pat<(store (bf16 FPR16Op:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>; + let AddedComplexity = 10 in { // Match all store 64 bits width whose type is compatible with FPR64 @@ -2893,6 +3057,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), 
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -2923,6 +3090,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } // truncstore i64 @@ -3030,6 +3200,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -3062,6 +3235,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } } // AddedComplexity = 10 @@ -3300,10 +3476,10 @@ defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; -defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; multiclass FPToIntegerIntPats<Intrinsic round, string INST> { def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>; @@ -3375,8 +3551,8 @@ def : Pat<(i64 (llround f64:$Rn)), // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// -defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; +defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>; //===----------------------------------------------------------------------===// // Unscaled integer to floating point conversion instruction. @@ -3541,8 +3717,8 @@ def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), // Floating point comparison instructions. 
//===----------------------------------------------------------------------===// -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; +defm FCMPE : FPComparison<1, "fcmpe", AArch64strict_fcmpe>; +defm FCMP : FPComparison<0, "fcmp", AArch64any_fcmp>; //===----------------------------------------------------------------------===// // Floating point conditional comparison instructions. @@ -3603,10 +3779,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, Sched<[]>; } -let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, - usesCustomInserter = 1 in -def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>; - //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// @@ -3788,12 +3960,16 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte> defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; -def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; -def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; -def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; -def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; -def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; // Patterns for vector long shift (by element width). 
These need to match all // three of zext, sext and anyext so it's easier to pull the patterns out of the @@ -3900,7 +4076,7 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; @@ -3917,7 +4093,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", @@ -3934,33 +4110,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +// Pseudo bitwise select pattern BSP. +// It is expanded into BSL/BIT/BIF after register allocation. 
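// (Illustrative note, not part of this patch.) The three real instructions
// compute the same bitwise select but differ in which source is tied to the
// destination register:
//
//   bsl Vd, Vn, Vm   // Vd = (Vd & Vn) | (~Vd & Vm)   -- selector mask tied
//   bit Vd, Vn, Vm   // Vd = (Vn & Vm) | (Vd & ~Vm)   -- "insert if true"
//   bif Vd, Vn, Vm   // Vd = (Vn & ~Vm) | (Vd & Vm)   -- "insert if false"
//
// Selecting the BSP pseudo first leaves the register allocator free to choose
// which operand to tie; the post-RA expansion then picks the matching opcode.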
+defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS), + (and (vnot node:$LHS), node:$RHS))>>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">; + +def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; @@ -4669,6 +4848,7 @@ multiclass ExtPat<ValueType VT64, ValueType VT128, int N> { defm : ExtPat<v8i8, v16i8, 8>; defm : ExtPat<v4i16, v8i16, 4>; defm : ExtPat<v4f16, v8f16, 4>; +defm : ExtPat<v4bf16, v8bf16, 4>; defm : ExtPat<v2i32, v4i32, 2>; defm : ExtPat<v2f32, v4f32, 2>; defm : ExtPat<v1i64, v2i64, 1>; @@ -4790,16 +4970,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), (v4f16 (DUPv4i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v4bf16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), (v8f16 (DUPv8i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v8bf16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; + def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), @@ -4915,6 +5108,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; @@ -4931,6 +5129,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, 
hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -4956,6 +5159,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0))>; +def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -5037,6 +5257,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, } defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; +defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>; defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>; defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; @@ -5050,6 +5271,9 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; def : Pat<(vector_extract (v8f16 V128:$Rn), 0), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), + (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; @@ -5057,6 +5281,8 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), + (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -5072,6 +5298,7 @@ def : ConcatPat<v4i32, v2i32>; def : ConcatPat<v4f32, v2f32>; def : ConcatPat<v8i16, v4i16>; def : ConcatPat<v8f16, v4f16>; +def : ConcatPat<v8bf16, v4bf16>; def : ConcatPat<v16i8, v8i8>; // If the high lanes are undef, though, we can just ignore them: @@ -5613,6 +5840,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane, + int_aarch64_neon_sqdmulh_laneq>; +defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane, + int_aarch64_neon_sqrdmulh_laneq>; + // Generated by MachineCombine defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>; defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; @@ -5780,8 +6012,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; -def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>; +def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", @@ -5794,8 +6026,8 @@ defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; -def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; +def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; @@ -6147,6 +6379,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex, ValueType VTy, ValueType STy, Instruction LD1> @@ -6161,6 +6397,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>; def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>; def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>; def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>; +def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>; class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex, ValueType VTy, ValueType STy, Instruction LD1> @@ -6176,6 +6413,7 @@ def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>; def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>; def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>; def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>; +def : Ld1Lane64Pat<load, VectorIndexH, v4bf16, bf16, 
LD1i16>; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; @@ -6204,6 +6442,7 @@ def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>; def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>; def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>; def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>; +def : St1Lane128Pat<store, VectorIndexH, v8bf16, bf16, ST1i16>; let AddedComplexity = 19 in class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex, @@ -6219,6 +6458,7 @@ def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>; def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>; def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>; def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>; +def : St1Lane64Pat<store, VectorIndexH, v4bf16, bf16, ST1i16>; multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1, @@ -6244,6 +6484,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>; defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>; defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>; defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>; +defm : St1LanePost64Pat<post_store, VectorIndexH, v4bf16, bf16, ST1i16_POST, 2>; multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1, @@ -6268,6 +6509,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>; defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>; defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>; defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>; +defm : St1LanePost128Pat<post_store, VectorIndexH, v8bf16, bf16, ST1i16_POST, 2>; let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; @@ -6508,6 +6750,7 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6515,12 +6758,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : 
Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6528,6 +6773,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6544,6 +6790,7 @@ def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6552,6 +6799,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6560,6 +6808,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6568,6 +6817,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6579,6 +6829,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 
FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -6587,6 +6838,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { @@ -6594,6 +6846,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -6604,6 +6857,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -6618,6 +6873,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; @@ -6629,6 +6886,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -6658,6 +6917,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -6669,6 +6929,8 @@ def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -6682,6 +6944,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : 
Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -6696,6 +6959,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -6722,6 +6987,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; @@ -6730,6 +6996,13 @@ def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), @@ -6744,8 +7017,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), + (v4bf16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -6755,6 +7042,7 @@ def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -6771,6 +7059,8 @@ def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 
(bitconvert (v4bf16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -6779,6 +7069,7 @@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -6791,6 +7082,8 @@ def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -6801,6 +7094,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -6813,6 +7107,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -6824,6 +7120,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -6838,6 +7135,8 @@ def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -6848,6 +7147,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -6862,6 +7162,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 
FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), @@ -6877,6 +7180,7 @@ def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } @@ -6890,6 +7194,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), @@ -6901,6 +7207,7 @@ let Predicates = [IsLE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6913,6 +7220,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), @@ -6929,6 +7238,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -6944,6 +7254,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6954,6 +7266,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -6970,6 +7283,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 (REV32v8i16 
FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -6998,6 +7313,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; @@ -7006,6 +7322,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), @@ -7022,8 +7345,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 (REV32v8i16 FPR128:$src))>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), + (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), + (v8bf16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), + (v8bf16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), + (v8bf16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -7033,6 +7372,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), @@ -7051,6 +7391,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), @@ -7061,6 +7403,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))), + 
(EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), @@ -7092,6 +7436,8 @@ multiclass InsertSubvectorUndef<ValueType Ty> { (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; } @@ -7317,3 +7663,5 @@ let AddedComplexity = 10 in { include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" + +include "AArch64InstrGISel.td" diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 3156bb4469638..d975b8bd04fe6 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -66,6 +67,10 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); +// Enable register renaming to find additional store pairing opportunities. +static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming", + cl::init(true), cl::Hidden); + #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { @@ -673,14 +678,14 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && "Expected promotable zero stores."); - MachineBasicBlock::iterator NextI = I; - ++NextI; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. 
if (NextI == MergeMI) - ++NextI; + NextI = next_nodbg(NextI, E); unsigned Opc = I->getOpcode(); bool IsScaled = !TII->isUnscaledLdSt(Opc); @@ -743,18 +748,17 @@ static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, const TargetRegisterInfo *TRI, unsigned Limit, std::function<bool(MachineInstr &, bool)> &Fn) { auto MBB = MI.getParent(); - for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(), - E = MBB->rend(); - I != E; I++) { + for (MachineInstr &I : + instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) { if (!Limit) return false; --Limit; - bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) { + bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) { return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && TRI->regsOverlap(MOP.getReg(), DefReg); }); - if (!Fn(*I, isDef)) + if (!Fn(I, isDef)) return false; if (isDef) break; @@ -778,14 +782,14 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, const LdStPairFlags &Flags) { - MachineBasicBlock::iterator NextI = I; - ++NextI; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. if (NextI == Paired) - ++NextI; + NextI = next_nodbg(NextI, E); int SExtIdx = Flags.getSExtIdx(); unsigned Opc = @@ -1004,8 +1008,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator StoreI) { - MachineBasicBlock::iterator NextI = LoadI; - ++NextI; + MachineBasicBlock::iterator NextI = + next_nodbg(LoadI, LoadI->getParent()->end()); int LoadSize = TII->getMemScale(*LoadI); int StoreSize = TII->getMemScale(*StoreI); @@ -1140,24 +1144,11 @@ static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, - AliasAnalysis *AA) { - // One of the instructions must modify memory. - if (!MIa.mayStore() && !MIb.mayStore()) - return false; - - // Both instructions must be memory operations. - if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) - return false; - - return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); -} - static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, AliasAnalysis *AA) { for (MachineInstr *MIb : MemInsns) - if (mayAlias(MIa, *MIb, AA)) + if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) return true; return false; @@ -1183,7 +1174,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( unsigned Count = 0; do { - --MBBI; + MBBI = prev_nodbg(MBBI, B); MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there @@ -1215,7 +1206,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. 
- if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) + if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false)) return false; } while (MBBI != B && Count < Limit); return false; @@ -1296,7 +1287,23 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n"); return false; } - auto canRenameMOP = [](const MachineOperand &MOP) { + auto canRenameMOP = [TRI](const MachineOperand &MOP) { + if (MOP.isReg()) { + auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); + // Renaming registers with multiple disjunct sub-registers (e.g. the + // result of a LD3) means that all sub-registers are renamed, potentially + // impacting other instructions we did not check. Bail out. + // Note that this relies on the structure of the AArch64 register file. In + // particular, a subregister cannot be written without overwriting the + // whole register. + if (RegClass->HasDisjunctSubRegs) { + LLVM_DEBUG( + dbgs() + << " Cannot rename operands with multiple disjunct subregisters (" + << MOP << ")\n"); + return false; + } + } return MOP.isImplicit() || (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); }; @@ -1325,6 +1332,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, // For defs, check if we can rename the first def of RegToRename. if (FoundDef) { + // For some pseudo instructions, we might not generate code in the end + // (e.g. KILL) and we would end up without a correct def for the rename + // register. + // TODO: This might be overly conservative and we could handle those cases + // in multiple ways: + // 1. Insert an extra copy, to materialize the def. + // 2. Skip pseudo-defs until we find an non-pseudo def. + if (MI.isPseudo()) { + LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI + << "\n"); + return false; + } + for (auto &MOP : MI.operands()) { if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() || !TRI->regsOverlap(MOP.getReg(), RegToRename)) @@ -1422,7 +1442,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MBBI = I; MachineBasicBlock::iterator MBBIWithRenameReg; MachineInstr &FirstMI = *I; - ++MBBI; + MBBI = next_nodbg(MBBI, E); bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); @@ -1433,6 +1453,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); Optional<bool> MaybeCanRename = None; + if (!EnableRenaming) + MaybeCanRename = {false}; + SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses; LiveRegUnits UsedInBetween; UsedInBetween.init(*TRI); @@ -1447,7 +1470,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Remember any instructions that read/write memory between FirstMI and MI. 
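A pattern worth noting in the hunks above and below: every bare `++`/`--` iterator step in this file is replaced by next_nodbg/prev_nodbg, so DBG_VALUE and other debug instructions no longer influence pairing and update-merging decisions. Roughly, those helpers behave like the following sketch (the real definitions live in llvm/include/llvm/CodeGen/MachineBasicBlock.h; the names below carry a _sketch suffix because the exact upstream signatures may differ):

    // Sketch only: step forward one instruction, then skip debug instructions.
    template <typename IterT>
    IterT next_nodbg_sketch(IterT It, IterT End) {
      ++It;
      while (It != End && It->isDebugInstr())
        ++It;
      return It;
    }

    // Sketch only: step backward one instruction, skipping debug instructions.
    template <typename IterT>
    IterT prev_nodbg_sketch(IterT It, IterT Begin) {
      --It;
      while (It != Begin && It->isDebugInstr())
        --It;
      return It;
    }

The net effect is that the scan limits and merge decisions are computed over real instructions only, which helps keep the generated code independent of whether debug info is present.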
SmallVector<MachineInstr *, 4> MemInsns; - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + for (unsigned Count = 0; MBBI != E && Count < Limit; + MBBI = next_nodbg(MBBI, E)) { MachineInstr &MI = *MBBI; UsedInBetween.accumulate(MI); @@ -1616,12 +1640,13 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // Return the instruction following the merged instruction, which is // the instruction following our unmerged load. Unless that's the add/sub // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; + if (NextI == Update) + NextI = next_nodbg(NextI, E); int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && @@ -1759,8 +1784,24 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // insn (inclusive) and the second insn. ModifiedRegUnits.clear(); UsedRegUnits.clear(); - ++MBBI; - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MBBI = next_nodbg(MBBI, E); + + // We can't post-increment the stack pointer if any instruction between + // the memory access (I) and the increment (MBBI) can access the memory + // region defined by [SP, MBBI]. + const bool BaseRegSP = BaseReg == AArch64::SP; + if (BaseRegSP) { + // FIXME: For now, we always block the optimization over SP in windows + // targets as it requires to adjust the unwind/debug info, messing up + // the unwind info can actually cause a miscompile. + const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo(); + if (MAI->usesWindowsCFI() && + I->getMF()->getFunction().needsUnwindTableEntry()) + return E; + } + + for (unsigned Count = 0; MBBI != E && Count < Limit; + MBBI = next_nodbg(MBBI, E)) { MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there @@ -1777,8 +1818,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Otherwise, if the base register is used or modified, we have no match, so // return early. + // If we are optimizing SP, do not allow instructions that may load or store + // in between the load and the optimized value update. if (!ModifiedRegUnits.available(BaseReg) || - !UsedRegUnits.available(BaseReg)) + !UsedRegUnits.available(BaseReg) || + (BaseRegSP && MBBI->mayLoadOrStore())) return E; } return E; @@ -1815,7 +1859,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( UsedRegUnits.clear(); unsigned Count = 0; do { - --MBBI; + MBBI = prev_nodbg(MBBI, B); MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp new file mode 100644 index 0000000000000..a37e380725544 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -0,0 +1,32 @@ +//=- AArch64MachineFunctionInfo.cpp - AArch64 Machine Function Info ---------=// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements AArch64-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+yaml::AArch64FunctionInfo::AArch64FunctionInfo(
+    const llvm::AArch64FunctionInfo &MFI)
+    : HasRedZone(MFI.hasRedZone()) {}
+
+void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+  MappingTraits<AArch64FunctionInfo>::mapping(YamlIO, *this);
+}
+
+void AArch64FunctionInfo::initializeBaseYamlFields(
+    const yaml::AArch64FunctionInfo &YamlMFI) {
+  if (YamlMFI.HasRedZone.hasValue())
+    HasRedZone = YamlMFI.HasRedZone;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 6ddb3fdb00463..84aa53f2bece1 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/IR/Function.h"
@@ -26,6 +27,10 @@
 namespace llvm {
+namespace yaml {
+struct AArch64FunctionInfo;
+} // end namespace yaml
+
 class MachineInstr;
 /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
@@ -126,6 +131,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // stack slot.
   unsigned TaggedBasePointerOffset = 0;
+  /// OutliningStyle denotes, if a function was outlined, how it was outlined,
+  /// e.g. Tail Call, Thunk, or Function if none apply.
+ Optional<std::string> OutliningStyle; + public: AArch64FunctionInfo() = default; @@ -137,6 +146,7 @@ public: if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) HasRedZone = false; } + void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } @@ -173,6 +183,9 @@ public: void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; } uint64_t getLocalStackSize() const { return LocalStackSize; } + void setOutliningStyle(std::string Style) { OutliningStyle = Style; } + Optional<std::string> getOutliningStyle() const { return OutliningStyle; } + void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; HasCalleeSavedStackSize = true; @@ -333,6 +346,25 @@ private: DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo; }; +namespace yaml { +struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { + Optional<bool> HasRedZone; + + AArch64FunctionInfo() = default; + AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~AArch64FunctionInfo() = default; +}; + +template <> struct MappingTraits<AArch64FunctionInfo> { + static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { + YamlIO.mapOptional("hasRedZone", MFI.HasRedZone); + } +}; + +} // end namespace yaml + } // end namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index 9135f1b401223..9044c94bc4fe5 100644 --- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -250,6 +250,20 @@ static bool isConstantUsingVectorTy(const Type *CstTy) { return false; } +// Returns true if \p C contains only ConstantData leafs and no global values, +// block addresses or constant expressions. Traverses ConstantAggregates. +static bool containsOnlyConstantData(const Constant *C) { + if (isa<ConstantData>(C)) + return true; + + if (isa<GlobalValue>(C) || isa<BlockAddress>(C) || isa<ConstantExpr>(C)) + return false; + + return all_of(C->operands(), [](const Use &U) { + return containsOnlyConstantData(cast<Constant>(&U)); + }); +} + /// Check if the given use (Instruction + OpIdx) of Cst should be converted into /// a load of a global variable initialized with Cst. /// A use should be converted if it is legal to do so. @@ -304,7 +318,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, // Do not mess with inline asm. const CallInst *CI = dyn_cast<const CallInst>(Instr); - return !(CI && isa<const InlineAsm>(CI->getCalledValue())); + return !(CI && CI->isInlineAsm()); } /// Check if the given Cst should be converted into @@ -550,9 +564,10 @@ bool AArch64PromoteConstant::runOnFunction(Function &F, for (Use &U : I.operands()) { Constant *Cst = dyn_cast<Constant>(U); // There is no point in promoting global values as they are already - // global. Do not promote constant expressions either, as they may - // require some code expansion. - if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst)) + // global. Do not promote constants containing constant expression, global + // values or blockaddresses either, as they may require some code + // expansion. 
+ if (!Cst || isa<GlobalValue>(Cst) || !containsOnlyConstantData(Cst)) continue; // Check if this constant is worth promoting. diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 14f839cd4f812..886158ca44901 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -43,24 +43,27 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) - return CSR_Win_AArch64_CFGuard_Check_SaveList; - if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) - return CSR_Win_AArch64_AAPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::GHC) // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + + // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save + // lists depending on that will need to have their Darwin variant as well. + if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) + return getDarwinCalleeSavedRegs(MF); + + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) + return CSR_Win_AArch64_CFGuard_Check_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) + return CSR_Win_AArch64_AAPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) return CSR_AArch64_AAVPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall) return CSR_AArch64_SVE_AAPCS_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) - return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? - CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : - CSR_AArch64_CXX_TLS_Darwin_SaveList; if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && MF->getFunction().getAttributes().hasAttrSomewhere( @@ -68,17 +71,47 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) - return CSR_Darwin_AArch64_AAPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::Win64) + // This is for OSes other than Windows; Windows is a separate case further + // above. 
+ return CSR_AArch64_AAPCS_X18_SaveList; return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg * +AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() && + "Invalid subtarget for getDarwinCalleeSavedRegs"); + + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) + report_fatal_error( + "Calling convention CFGuard_Check is unsupported on Darwin."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) + return CSR_Darwin_AArch64_AAVPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall) + report_fatal_error( + "Calling convention SVE_VectorCall is unsupported on Darwin."); + if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() + ? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList + : CSR_Darwin_AArch64_CXX_TLS_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() + ->supportSwiftError() && + MF->getFunction().getAttributes().hasAttrSomewhere( + Attribute::SwiftError)) + return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return CSR_Darwin_AArch64_RT_MostRegs_SaveList; + return CSR_Darwin_AArch64_AAPCS_SaveList; +} + const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) - return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList; return nullptr; } @@ -113,6 +146,32 @@ AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, } const uint32_t * +AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() && + "Invalid subtarget for getDarwinCallPreservedMask"); + + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_Darwin_AArch64_CXX_TLS_RegMask; + if (CC == CallingConv::AArch64_VectorCall) + return CSR_Darwin_AArch64_AAVPCS_RegMask; + if (CC == CallingConv::AArch64_SVE_VectorCall) + report_fatal_error( + "Calling convention SVE_VectorCall is unsupported on Darwin."); + if (CC == CallingConv::CFGuard_Check) + report_fatal_error( + "Calling convention CFGuard_Check is unsupported on Darwin."); + if (MF.getSubtarget<AArch64Subtarget>() + .getTargetLowering() + ->supportSwiftError() && + MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_Darwin_AArch64_RT_MostRegs_RegMask; + return CSR_Darwin_AArch64_AAPCS_RegMask; +} + +const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack); @@ -121,9 +180,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask; - if (CC == CallingConv::CXX_FAST_TLS) - return SCS ? 
CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask - : CSR_AArch64_CXX_TLS_Darwin_RegMask; + + // All the following calling conventions are handled differently on Darwin. + if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) { + if (SCS) + report_fatal_error("ShadowCallStack attribute not supported on Darwin."); + return getDarwinCallPreservedMask(MF, CC); + } + if (CC == CallingConv::AArch64_VectorCall) return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; if (CC == CallingConv::AArch64_SVE_VectorCall) @@ -145,7 +209,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) - return CSR_AArch64_TLS_Darwin_RegMask; + return CSR_Darwin_AArch64_TLS_RegMask; assert(TT.isOSBinFormatELF() && "Invalid target"); return CSR_AArch64_TLS_ELF_RegMask; @@ -186,6 +250,8 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // In case that the calling convention does not use the same register for // both, the function should return NULL (does not currently apply) assert(CC != CallingConv::GHC && "should not be GHC calling convention."); + if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) + return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask; return CSR_AArch64_AAPCS_ThisReturn_RegMask; } @@ -222,7 +288,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { + MCRegister Reg) const { return getReservedRegs(MF)[Reg]; } @@ -240,11 +306,11 @@ void AArch64RegisterInfo::emitReservedArgRegCallError( } bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, - unsigned PhysReg) const { + MCRegister PhysReg) const { return !isReservedReg(MF, PhysReg); } -bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { +bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR; } @@ -390,12 +456,16 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; + // If even offset 0 is illegal, we don't want a virtual base register. + if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) + return false; + // The offset likely isn't legal; we want to allocate a virtual base register. return true; } bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - unsigned BaseReg, + Register BaseReg, int64_t Offset) const { assert(MI && "Unable to get the legal offset for nil instruction."); StackOffset SaveOffset(Offset, MVT::i8); @@ -405,7 +475,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, /// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx /// at the beginning of the basic block. 
void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, + Register BaseReg, int FrameIdx, int64_t Offset) const { MachineBasicBlock::iterator Ins = MBB->begin(); @@ -426,7 +496,7 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addImm(Shifter); } -void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { // ARM doesn't need the general 64-bit offsets StackOffset Off(Offset, MVT::i8); @@ -445,6 +515,27 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } +// Create a scratch register for the frame index elimination in an instruction. +// This function has special handling of stack tagging loop pseudos, in which +// case it can also change the instruction opcode (but not the operands). +static Register +createScratchRegisterForInstruction(MachineInstr &MI, + const AArch64InstrInfo *TII) { + // ST*Gloop have a reserved scratch register in operand 1. Use it, and also + // replace the instruction with the writeback variant because it will now + // satisfy the operand constraints for it. + if (MI.getOpcode() == AArch64::STGloop) { + MI.setDesc(TII->get(AArch64::STGloop_wback)); + return MI.getOperand(1).getReg(); + } else if (MI.getOpcode() == AArch64::STZGloop) { + MI.setDesc(TII->get(AArch64::STZGloop_wback)); + return MI.getOperand(1).getReg(); + } else { + return MI.getMF()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + } +} + void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -461,7 +552,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); bool Tagged = MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; - unsigned FrameReg; + Register FrameReg; // Special handling of dbg_value, stackmap and patchpoint instructions. if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || @@ -531,8 +622,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
- Register ScratchReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = createScratchRegisterForInstruction(MI, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } @@ -572,6 +662,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return 32; case AArch64::FPR128_loRegClassID: + case AArch64::FPR64_loRegClassID: + case AArch64::FPR16_loRegClassID: return 16; } } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 2c3f82c530d8a..22a8ba76c6111 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -34,7 +34,7 @@ public: return getEncodingValue(i); } - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; bool isAnyArgRegReserved(const MachineFunction &MF) const; void emitReservedArgRegCallError(const MachineFunction &MF) const; @@ -44,10 +44,13 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const; const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; unsigned getCSRFirstUseCost() const override { // The cost will be compared against BlockFrequency where entry has the @@ -83,8 +86,8 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, - unsigned PhysReg) const override; - bool isConstantPhysReg(unsigned PhysReg) const override; + MCRegister PhysReg) const override; + bool isConstantPhysReg(MCRegister PhysReg) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const override; @@ -96,12 +99,12 @@ public: bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg, int FrameIdx, int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, @@ -118,10 +121,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - bool trackLivenessAfterRegAlloc(const MachineFunction&) const override { - return true; - } - unsigned getLocalAddressRegister(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index f52feab039530..bd05c56009a1d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -422,25 +422,35 @@ 
def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>; def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { let Size = 8; } -def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { +def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} + +def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> { let Size = 16; } def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64, v4f16], - 64, (sequence "D%u", 0, 31)>; + v1i64, v4f16, v4bf16], + 64, (sequence "D%u", 0, 31)>; +def FPR64_lo : RegisterClass<"AArch64", + [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32, + v1f64], + 64, (trunc FPR64, 16)>; + // We don't (yet) have an f128 legal type, so don't use that here. We // normalize 128-bit vectors to v2f64 for arg passing and such, so use // that here. def FPR128 : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, - v8f16], + v8f16, v8bf16], 128, (sequence "Q%u", 0, 31)>; // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, + v8bf16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. @@ -503,6 +513,9 @@ def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; let PredicateMethod = "isNeonVectorRegLo"; } +def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> { + let ParserMatchClass = VectorRegLoAsmOperand; +} def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> { let ParserMatchClass = VectorRegLoAsmOperand; } @@ -641,6 +654,10 @@ def FPR16Op : RegisterOperand<FPR16, "printOperand"> { let ParserMatchClass = FPRAsmOperand<"FPR16">; } +def FPR16Op_lo : RegisterOperand<FPR16_lo, "printOperand"> { + let ParserMatchClass = FPRAsmOperand<"FPR16_lo">; +} + def FPR32Op : RegisterOperand<FPR32, "printOperand"> { let ParserMatchClass = FPRAsmOperand<"FPR32">; } @@ -664,11 +681,11 @@ def XSeqPairs : RegisterTuples<[sube64, subo64], [(decimate (rotl GPR64, 0), 2), (decimate (rotl GPR64, 1), 2)]>; -def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, +def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, (add WSeqPairs)>{ let Size = 64; } -def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, +def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, (add XSeqPairs)>{ let Size = 128; } @@ -780,7 +797,7 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>; def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>; } -// Enum descibing the element size for destructive +// Enum describing the element size for destructive // operations. 
class ElementSizeEnum<bits<3> val> {
  bits<3> Value = val;
}
@@ -862,6 +879,7 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;
class ZPRClass<int lastreg> : RegisterClass<"AArch64",
                  [nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+                  nxv2bf16, nxv4bf16, nxv8bf16,
                   nxv2f32, nxv4f32, nxv2f64],
                  128, (sequence "Z%u", 0, lastreg)> {
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 28a7e680849b0..fc31e701d3af1 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -219,7 +219,7 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check if replacement decision is already available in the cached table.
  // if so, return it.
-  std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
    return SIMDInstrTable[InstID];
@@ -288,7 +288,8 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  // For this optimization, check for all concerned instructions.
  case Interleave:
-    std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+    std::string Subtarget =
+        std::string(SchedModel.getSubtargetInfo()->getCPU());
    if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
      return InterlEarlyExit[Subtarget];
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
new file mode 100644
index 0000000000000..cb4dc8462f68d
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -0,0 +1,443 @@
+//===- AArch64SLSHardening.cpp - Harden Straight Line Misspeculation ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass to insert code to mitigate against side channel
+// vulnerabilities that may happen under straight line misspeculation.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/IndirectThunks.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-sls-hardening" + +#define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" + +namespace { + +class AArch64SLSHardening : public MachineFunctionPass { +public: + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const AArch64Subtarget *ST; + + static char ID; + + AArch64SLSHardening() : MachineFunctionPass(ID) { + initializeAArch64SLSHardeningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { return AARCH64_SLS_HARDENING_NAME; } + +private: + bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const; + bool hardenBLRs(MachineBasicBlock &MBB) const; + MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator) const; +}; + +} // end anonymous namespace + +char AArch64SLSHardening::ID = 0; + +INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening", + AARCH64_SLS_HARDENING_NAME, false, false) + +static void insertSpeculationBarrier(const AArch64Subtarget *ST, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool AlwaysUseISBDSB = false) { + assert(MBBI != MBB.begin() && + "Must not insert SpeculationBarrierEndBB as only instruction in MBB."); + assert(std::prev(MBBI)->isBarrier() && + "SpeculationBarrierEndBB must only follow unconditional control flow " + "instructions."); + assert(std::prev(MBBI)->isTerminator() && + "SpeculationBarrierEndBB must only follow terminators."); + const TargetInstrInfo *TII = ST->getInstrInfo(); + unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB + ? AArch64::SpeculationBarrierSBEndBB + : AArch64::SpeculationBarrierISBDSBEndBB; + if (MBBI == MBB.end() || + (MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB && + MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB)) + BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc)); +} + +bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<AArch64Subtarget>(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + + bool Modified = false; + for (auto &MBB : MF) { + Modified |= hardenReturnsAndBRs(MBB); + Modified |= hardenBLRs(MBB); + } + + return Modified; +} + +static bool isBLR(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::BLR: + case AArch64::BLRNoIP: + return true; + case AArch64::BLRAA: + case AArch64::BLRAB: + case AArch64::BLRAAZ: + case AArch64::BLRABZ: + llvm_unreachable("Currently, LLVM's code generator does not support " + "producing BLRA* instructions. 
Therefore, there's no " + "support in this pass for those instructions."); + } + return false; +} + +bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const { + if (!ST->hardenSlsRetBr()) + return false; + bool Modified = false; + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end(); + MachineBasicBlock::iterator NextMBBI; + for (; MBBI != E; MBBI = NextMBBI) { + MachineInstr &MI = *MBBI; + NextMBBI = std::next(MBBI); + if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) { + assert(MI.isTerminator()); + insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc()); + Modified = true; + } + } + return Modified; +} + +static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; + +static const struct ThunkNameAndReg { + const char* Name; + Register Reg; +} SLSBLRThunks[] = { + { "__llvm_slsblr_thunk_x0", AArch64::X0}, + { "__llvm_slsblr_thunk_x1", AArch64::X1}, + { "__llvm_slsblr_thunk_x2", AArch64::X2}, + { "__llvm_slsblr_thunk_x3", AArch64::X3}, + { "__llvm_slsblr_thunk_x4", AArch64::X4}, + { "__llvm_slsblr_thunk_x5", AArch64::X5}, + { "__llvm_slsblr_thunk_x6", AArch64::X6}, + { "__llvm_slsblr_thunk_x7", AArch64::X7}, + { "__llvm_slsblr_thunk_x8", AArch64::X8}, + { "__llvm_slsblr_thunk_x9", AArch64::X9}, + { "__llvm_slsblr_thunk_x10", AArch64::X10}, + { "__llvm_slsblr_thunk_x11", AArch64::X11}, + { "__llvm_slsblr_thunk_x12", AArch64::X12}, + { "__llvm_slsblr_thunk_x13", AArch64::X13}, + { "__llvm_slsblr_thunk_x14", AArch64::X14}, + { "__llvm_slsblr_thunk_x15", AArch64::X15}, + // X16 and X17 are deliberately missing, as the mitigation requires those + // register to not be used in BLR. See comment in ConvertBLRToBL for more + // details. + { "__llvm_slsblr_thunk_x18", AArch64::X18}, + { "__llvm_slsblr_thunk_x19", AArch64::X19}, + { "__llvm_slsblr_thunk_x20", AArch64::X20}, + { "__llvm_slsblr_thunk_x21", AArch64::X21}, + { "__llvm_slsblr_thunk_x22", AArch64::X22}, + { "__llvm_slsblr_thunk_x23", AArch64::X23}, + { "__llvm_slsblr_thunk_x24", AArch64::X24}, + { "__llvm_slsblr_thunk_x25", AArch64::X25}, + { "__llvm_slsblr_thunk_x26", AArch64::X26}, + { "__llvm_slsblr_thunk_x27", AArch64::X27}, + { "__llvm_slsblr_thunk_x28", AArch64::X28}, + { "__llvm_slsblr_thunk_x29", AArch64::FP}, + // X30 is deliberately missing, for similar reasons as X16 and X17 are + // missing. + { "__llvm_slsblr_thunk_x31", AArch64::XZR}, +}; + +namespace { +struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> { + const char *getThunkPrefix() { return SLSBLRNamePrefix; } + bool mayUseThunk(const MachineFunction &MF) { + // FIXME: This could also check if there are any BLRs in the function + // to more accurately reflect if a thunk will be needed. + return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr(); + } + void insertThunks(MachineModuleInfo &MMI); + void populateThunk(MachineFunction &MF); +}; +} // namespace + +void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) { + // FIXME: It probably would be possible to filter which thunks to produce + // based on which registers are actually used in BLR instructions in this + // function. But would that be a worthwhile optimization? + for (auto T : SLSBLRThunks) + createThunkFunction(MMI, T.Name); +} + +void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) { + // FIXME: How to better communicate Register number, rather than through + // name and lookup table? 
+ assert(MF.getName().startswith(getThunkPrefix())); + auto ThunkIt = llvm::find_if( + SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); }); + assert(ThunkIt != std::end(SLSBLRThunks)); + Register ThunkReg = ThunkIt->Reg; + + const TargetInstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + assert (MF.size() == 1); + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + + // These thunks need to consist of the following instructions: + // __llvm_slsblr_thunk_xN: + // BR xN + // barrierInsts + Entry->addLiveIn(ThunkReg); + // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0 + BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16) + .addReg(AArch64::XZR) + .addReg(ThunkReg) + .addImm(0); + BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16); + // Make sure the thunks do not make use of the SB extension in case there is + // a function somewhere that will call to it that for some reason disabled + // the SB extension locally on that function, even though it's enabled for + // the module otherwise. Therefore set AlwaysUseISBSDB to true. + insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry, + Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/); +} + +MachineBasicBlock & +AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + // Transform a BLR to a BL as follows: + // Before: + // |-----------------------------| + // | ... | + // | instI | + // | BLR xN | + // | instJ | + // | ... | + // |-----------------------------| + // + // After: + // |-----------------------------| + // | ... | + // | instI | + // | BL __llvm_slsblr_thunk_xN | + // | instJ | + // | ... | + // |-----------------------------| + // + // __llvm_slsblr_thunk_xN: + // |-----------------------------| + // | BR xN | + // | barrierInsts | + // |-----------------------------| + // + // The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter. + // This function merely needs to transform BLR xN into BL + // __llvm_slsblr_thunk_xN. + // + // Since linkers are allowed to clobber X16 and X17 on function calls, the + // above mitigation only works if the original BLR instruction was not + // BLR X16 nor BLR X17. Code generation before must make sure that no BLR + // X16|X17 was produced if the mitigation is enabled. + + MachineInstr &BLR = *MBBI; + assert(isBLR(BLR)); + unsigned BLOpcode; + Register Reg; + bool RegIsKilled; + switch (BLR.getOpcode()) { + case AArch64::BLR: + case AArch64::BLRNoIP: + BLOpcode = AArch64::BL; + Reg = BLR.getOperand(0).getReg(); + assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR); + RegIsKilled = BLR.getOperand(0).isKill(); + break; + case AArch64::BLRAA: + case AArch64::BLRAB: + case AArch64::BLRAAZ: + case AArch64::BLRABZ: + llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, " + "therefore there is no need to support them for now."); + default: + llvm_unreachable("unhandled BLR"); + } + DebugLoc DL = BLR.getDebugLoc(); + + // If we'd like to support also BLRAA and BLRAB instructions, we'd need + // a lot more different kind of thunks. 
+  // For example, a
+  //
+  //   BLRAA xN, xM
+  //
+  // instruction probably would need to be transformed to something like:
+  //
+  //   BL __llvm_slsblraa_thunk_x<N>_x<M>
+  //
+  // __llvm_slsblraa_thunk_x<N>_x<M>:
+  //   BRAA x<N>, x<M>
+  //   barrierInsts
+  //
+  // Given that about 30 different values of N are possible and about 30
+  // different values of M are possible in the above, with the current way
+  // of producing indirect thunks, we'd be producing about 30 times 30, i.e.
+  // about 900 thunks (where most might not be actually called). This would
+  // multiply further by two to support both BLRAA and BLRAB variants of those
+  // instructions.
+  // If we'd want to support this, we'd probably need to look into a different
+  // way to produce thunk functions, based on which variants are actually
+  // needed, rather than producing all possible variants.
+  // So far, LLVM never produces BLRA* instructions, so let's leave this
+  // for the future when LLVM can start producing BLRA* instructions.
+  MachineFunction &MF = *MBBI->getMF();
+  MCContext &Context = MBB.getParent()->getContext();
+  auto ThunkIt =
+      llvm::find_if(SLSBLRThunks, [Reg](auto T) { return T.Reg == Reg; });
+  assert (ThunkIt != std::end(SLSBLRThunks));
+  MCSymbol *Sym = Context.getOrCreateSymbol(ThunkIt->Name);
+
+  MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
+
+  // Now copy the implicit operands from BLR to BL and copy other necessary
+  // info.
+  // However, both BLR and BL instructions implicitly use SP and implicitly
+  // define LR. Blindly copying implicit operands would result in the SP and
+  // LR operands being present multiple times. While this may not be too much
+  // of an issue, let's avoid that for cleanliness, by removing those implicit
+  // operands from the BL created above before we copy over all implicit
+  // operands from the BLR.
+  int ImpLROpIdx = -1;
+  int ImpSPOpIdx = -1;
+  for (unsigned OpIdx = BL->getNumExplicitOperands();
+       OpIdx < BL->getNumOperands(); OpIdx++) {
+    MachineOperand Op = BL->getOperand(OpIdx);
+    if (!Op.isReg())
+      continue;
+    if (Op.getReg() == AArch64::LR && Op.isDef())
+      ImpLROpIdx = OpIdx;
+    if (Op.getReg() == AArch64::SP && !Op.isDef())
+      ImpSPOpIdx = OpIdx;
+  }
+  assert(ImpLROpIdx != -1);
+  assert(ImpSPOpIdx != -1);
+  int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+  int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+  BL->RemoveOperand(FirstOpIdxToRemove);
+  BL->RemoveOperand(SecondOpIdxToRemove);
+  // Now copy over the implicit operands from the original BLR
+  BL->copyImplicitOps(MF, BLR);
+  MF.moveCallSiteInfo(&BLR, BL);
+  // Also add the register called in the BLR as being used in the called thunk.
+ BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/, + RegIsKilled /*isKill*/)); + // Remove BLR instruction + MBB.erase(MBBI); + + return MBB; +} + +bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const { + if (!ST->hardenSlsBlr()) + return false; + bool Modified = false; + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator NextMBBI; + for (; MBBI != E; MBBI = NextMBBI) { + MachineInstr &MI = *MBBI; + NextMBBI = std::next(MBBI); + if (isBLR(MI)) { + ConvertBLRToBL(MBB, MBBI); + Modified = true; + } + } + return Modified; +} + +FunctionPass *llvm::createAArch64SLSHardeningPass() { + return new AArch64SLSHardening(); +} + +namespace { +class AArch64IndirectThunks : public MachineFunctionPass { +public: + static char ID; + + AArch64IndirectThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "AArch64 Indirect Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + std::tuple<SLSBLRThunkInserter> TIs; + + // FIXME: When LLVM moves to C++17, these can become folds + template <typename... ThunkInserterT> + static void initTIs(Module &M, + std::tuple<ThunkInserterT...> &ThunkInserters) { + (void)std::initializer_list<int>{ + (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...}; + } + template <typename... ThunkInserterT> + static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF, + std::tuple<ThunkInserterT...> &ThunkInserters) { + bool Modified = false; + (void)std::initializer_list<int>{ + Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...}; + return Modified; + } +}; + +} // end anonymous namespace + +char AArch64IndirectThunks::ID = 0; + +FunctionPass *llvm::createAArch64IndirectThunks() { + return new AArch64IndirectThunks(); +} + +bool AArch64IndirectThunks::doInitialization(Module &M) { + initTIs(M, TIs); + return false; +} + +bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << getPassName() << '\n'); + auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); + return runTIs(MMI, MF, TIs); +} diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c849d7af9a40b..28a54e6f7d79f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -10,65 +10,188 @@ // //===----------------------------------------------------------------------===// -def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ +// For predicated nodes where the entire operation is controlled by a governing +// predicate, please stick to a similar naming convention as used for the +// ISD nodes: +// +// SDNode <=> AArch64ISD +// ------------------------------- +// _m<n> <=> _MERGE_OP<n> +// _mt <=> _MERGE_PASSTHRU +// _z <=> _MERGE_ZERO +// _p <=> _PRED +// +// Given the context of this file, it is not strictly necessary to use _p to +// distinguish predicated from unpredicated nodes given that most SVE +// instructions are predicated. 
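For readers cross-referencing the C++ lowering code, a node with the SDT_AArch64_LD1 profile defined below carries a chain, the governing predicate, the base pointer, and the memory value type. The following is only a minimal sketch of how such a *_MERGE_ZERO load node is typically built on the SelectionDAG side; the surrounding function and the names Chain, Pred, Base, MemVT, VT and DL are assumptions for illustration, not part of this patch:

    // Hypothetical lowering/combine fragment in AArch64ISelLowering; the
    // placeholder values are assumed to be in scope at that point.
    SDVTList VTs = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {Chain, Pred, Base, DAG.getValueType(MemVT)};
    SDValue Load = DAG.getNode(AArch64ISD::LD1_MERGE_ZERO, DL, VTs, Ops);
    // Load.getValue(0) is the zero-merged vector, Load.getValue(1) the chain.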
+ +// Contiguous loads - node definitions +// +def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +// Non-faulting & first-faulting loads - node definitions +// +def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +// Contiguous load and replicate - node definitions +// + +def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; + +// Gather loads - node definitions +// +def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [ +def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ +def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, 
[SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +// Contiguous stores - node definitions +// +def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [ + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, + SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2> +]>; + +def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>; + +// Scatter stores - node definitions +// +def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [ +def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, 
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; - -def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; - -def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def 
AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +// AArch64 SVE/SVE2 - the remaining node definitions +// + +// SVE CNT/INC/RDVL +def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">; +def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">; +def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">; +def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">; + +// SVE DEC +def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">; +def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">; +def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">; def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>; +def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>; +def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>; +def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>; +def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; +def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; +def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; +def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; +def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; +def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; +def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; +def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; +def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; + +def SDT_AArch64Arith : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3> +]>; + +def SDT_AArch64FMA : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4> +]>; -def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; -def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; -def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; -def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; -def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; -def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; -def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; -def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; -def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; +// Predicated operations with the result of inactive lanes being unspecified. +def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; +def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; +def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; +def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; +def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; + +// Merging op1 into the inactive lanes. 
+def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>; +def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>; +def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>; +def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>; +def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>; +def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>; +def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>; def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; +def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>; def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>; @@ -76,42 +199,57 @@ def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>; def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; -let Predicates = [HasSVE] in { +def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>; +def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>; + +def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>; +def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; - def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; - def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; - def RDFFR_P : sve_int_rdffr_unpred<"rdffr">; - def SETFFR : sve_int_setffr<"setffr">; - def WRFFR : sve_int_wrffr<"wrffr">; +def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; - defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; - defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; - defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; - defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>; - defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>; - defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>; +let Predicates = [HasSVE] in { + defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; + def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; + defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>; + def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>; + def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; + + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>; + defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; + defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; + defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; + defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>; - defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>; - defm SUB_ZPmZ : 
sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>; - defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>; + defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>; + defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; + defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; + + defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>; + defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>; + defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>; + } defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; - defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; + defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>; defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>; - defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; - defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; - defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; - defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>; + defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; + defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; + defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; + defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; @@ -121,32 +259,45 @@ let Predicates = [HasSVE] in { // SVE predicated integer reductions. 
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>; defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>; - defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>; - defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>; - defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>; - defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>; - defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>; - defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>; - defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>; + defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>; + defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>; + defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>; + defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>; + defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>; + defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>; + defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>; defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>; defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>; defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>; - defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>; - defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>; - defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>; - defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>; + defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>; + defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>; + defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>; + defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>; - defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; - defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; + defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; + defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>; defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>; - defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>; - defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>; - defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>; - defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>; + // Add unpredicated alternative for the mul instruction. 
+ def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2), + (MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>; + def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2), + (MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>; + def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2), + (MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>; + def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2), + (MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>; + + defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">; + defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">; + defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>; + defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>; + + defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>; + defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>; defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; @@ -166,15 +317,20 @@ let Predicates = [HasSVE] in { defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>; defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>; defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>; + } + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>; defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>; - defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>; - defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>; - defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>; - defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>; + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>; + defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>; + defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>; + defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>; defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>; defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>; @@ -190,19 +346,36 @@ let Predicates = [HasSVE] in { defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>; - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>; - defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>; - defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>; - defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>; - defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>; - defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>; - defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", 
int_aarch64_sve_fmin>; - defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>; + defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>; + defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; + defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; + defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; + defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>; + defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>; + defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>; defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>; - defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>; - defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>; - defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>; + defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>; + defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>; + defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">; + + defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>; + defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>; + defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>; + defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>; + defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>; + defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>; + defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>; + defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>; + defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>; + defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>; + defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>; + defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>; + } defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>; @@ -226,6 +399,16 @@ let Predicates = [HasSVE] in { defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>; defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>; + // Add patterns for FMA where disabled lanes are undef. + // FIXME: Implement a pseudo so we can choose a better instruction after + // regalloc. 
+ def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)), + (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)), + (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)), + (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>; + defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>; @@ -235,12 +418,21 @@ let Predicates = [HasSVE] in { defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; // SVE floating point reductions. - defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>; - defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>; - defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>; - defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>; - defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>; - defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>; + defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; + defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; + defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; + defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; + defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>; + defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>; + + // Use more efficient NEON instructions to extract elements within the NEON + // part (first 128bits) of an SVE register. + def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>; + def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>; + def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), + (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>; // Splat immediate (unpredicated) defm DUP_ZI : sve_int_dup_imm<"dup">; @@ -257,18 +449,88 @@ let Predicates = [HasSVE] in { defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) - defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">; - defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">; + defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>; + defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>; + + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)), + (CPY_ZPmV_H $passthru, $pg, $splat)>; + } + + // Duplicate FP scalar into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), + (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup (bf16 
FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + } + + // Duplicate +0.0 into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + } + + // Duplicate Int immediate into all vector elements + def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_B $a, $b)>; + def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_H $a, $b)>; + def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_S $a, $b)>; + def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_D $a, $b)>; + + // Duplicate FP immediate into all vector elements + let AddedComplexity = 2 in { + def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)), + (FDUP_ZI_D fpimm64:$imm8)>; + } // Select elements from either vector (predicated) defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>; + def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>; + } + defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>; + } + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>; defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; @@ -277,6 +539,10 @@ let Predicates = [HasSVE] in { defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>; defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>; + } + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; @@ -290,34 +556,34 @@ let Predicates = [HasSVE] in { def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; - def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">; - def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">; - def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">; - def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">; + defm 
BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; + defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; + defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; + defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>; - def BRKN_PPzP : sve_int_brkn<0b0, "brkn">; - def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">; + defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>; + defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>; - defm BRKA_PPzP : sve_int_break_z<0b000, "brka">; - defm BRKA_PPmP : sve_int_break_m<0b001, "brka">; - defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">; - defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">; - defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">; - defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">; + defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>; + defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>; + defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>; + defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>; + defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; + defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; def PTEST_PP : sve_int_ptest<0b010000, "ptest">; def PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; - defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>; + defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>; defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>; - defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>; + defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>; defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>; defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>; defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>; - defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>; defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>; defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>; defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>; @@ -333,11 +599,23 @@ let Predicates = [HasSVE] in { defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>; defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>; + def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>; + def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>; + def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>; + } + defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>; defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>; defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>; defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>; + def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>; + } + // continuous 
load with reg+immediate defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>; defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>; @@ -468,115 +746,115 @@ let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. // ld1h z0.s, p0/z, [x0, z0.s, uxtw] - defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; // Gathers using 32-bit pointers with scaled offset, e.g. 
// ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>; + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>; // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] - defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D 
: sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. // ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, 
AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; @@ -640,16 +918,16 @@ let Predicates = [HasSVE] in { // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] - defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>; - defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>; - defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>; + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>; - defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>; - defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>; - defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d] @@ -722,47 +1000,92 @@ let Predicates = [HasSVE] in { def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; +multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> { + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>; + } + + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)), + (RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>; + } + + // default fallback + def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>; + } + + defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>; + // Gather prefetch using scaled 32-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.s, uxtw #1] - defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g. 
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
// prfh pldl1keep, p0, [z0.s, #16] // prfh pldl1keep, p0, [z0.d, #16] - defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; - defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">; defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">; defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">; defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">; + def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_0 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_1 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_2 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_3 $Op1, $Op2)>; + + def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_0 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_1 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>; + defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>; + } + defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>; defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>; @@ -770,6 +1093,15 @@ let Predicates = [HasSVE] in { defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>; defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>; + def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>; + def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>; + def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>; + def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, 
TRN1_ZZZ_H>; + def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>; + } + defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>; defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>; defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>; @@ -777,12 +1109,12 @@ let Predicates = [HasSVE] in { defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>; defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; - defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>; - defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>; - defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>; - defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>; - defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>; - defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; + defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; + defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>; + defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>; defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>; defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>; @@ -795,22 +1127,22 @@ let Predicates = [HasSVE] in { defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>; defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>; - defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>; - defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>; - defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>; - defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>; - defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>; - defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>; - defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>; - defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>; - defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>; - defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>; - - defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo", int_aarch64_sve_fcmpuo>; + defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>; + defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>; + defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>; + defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>; + defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>; + 
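The integer compare multiclasses above now take ISD condition codes (SETGE/SETLE, SETUGT/SETULT, and so on) rather than the SVE compare intrinsics, so a scalable-vector setcc can be matched directly in either operand order, including the immediate forms. Below is a minimal C sketch of source that ends up as such a compare; the ACLE intrinsic name from arm_sve.h is assumed, and the exact instruction chosen (register or immediate form, original or reversed condition) depends on the surrounding code.

#include <arm_sve.h>

/* Hedged sketch: with the SETGT/SETLT patterns above, a compare of a
   scalable vector against a small immediate can be selected to the
   CMPGT (immediate) form, e.g. cmpgt p0.s, p0/z, z0.s, #0. */
svbool_t positive_lanes(svbool_t pg, svint32_t v) {
  return svcmpgt_n_s32(pg, v, 0); /* predicate of lanes where v > 0 */
}
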
defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>; + defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; + defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; + + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -928,71 +1260,78 @@ let Predicates = [HasSVE] in { defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; - defm INDEX_RR : sve_int_index_rr<"index">; - defm INDEX_IR : sve_int_index_ir<"index">; - defm INDEX_RI : sve_int_index_ri<"index">; - defm INDEX_II : sve_int_index_ii<"index">; + defm INDEX_RR : sve_int_index_rr<"index", index_vector>; + defm INDEX_IR : sve_int_index_ir<"index", index_vector>; + defm INDEX_RI : sve_int_index_ri<"index", index_vector>; + defm INDEX_II : sve_int_index_ii<"index", index_vector>; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">; - defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>; defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>; + defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>; + defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>; + } - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, 
"LSRR_ZPmZ">; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>; defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>; - defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, 
int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm 
FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", 
int_aarch64_sve_frintn>; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>; @@ -1004,6 +1343,18 @@ let Predicates = [HasSVE] in { defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>; defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>; + let Predicates = [HasBF16, HasSVE] in { + defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; + defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; + defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>; + defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; + defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; + defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; + defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; + } + // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -1089,6 +1440,20 @@ let Predicates = [HasSVE] in { def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4. + // These get expanded to individual LDR_ZXI/STR_ZXI instructions in + // AArch64ExpandPseudoInsts. + let mayLoad = 1, hasSideEffects = 0 in { + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + let mayStore = 1, hasSideEffects = 0 in { + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), (PTEST_PP PPR:$pg, PPR:$src)>; def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), @@ -1098,6 +1463,25 @@ let Predicates = [HasSVE] in { def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), (PTEST_PP PPR:$pg, PPR:$src)>; + // LD1R of 128-bit masked data + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_B_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_H_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_W_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_D_IMM $gp, $base, (i64 0))>; + + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(sext_inreg (nxv2i64 
ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; @@ -1105,346 +1489,899 @@ let Predicates = [HasSVE] in { def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>; - def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; - - def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; - - def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; - - def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; - - def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; - - def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; - - def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 
(bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ // General case that we ideally never want to match.
+ def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
+
+ let AddedComplexity = 5 in {
+ def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
+ def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
+
+ def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
+ def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
+
+ def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
+ }
+
+ // FIXME: BigEndian requires an additional REV instruction to satisfy the
+ // constraint that none of the bits change when stored to memory as one
+ // type, and reloaded as another type.
+ let Predicates = [IsLE] in {
+ def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+ def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+ def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+ def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+ def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2f64
ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + + } + + let Predicates = [IsLE, HasBF16, HasSVE] in { + def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + } + + let Predicates = [IsLE, HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + + def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } + + def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 
(reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + + def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; // Add more complex addressing modes here as required multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load, - Instruction RegImmInst> { - + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous loads - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>; - defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>; - defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>; - defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>; - defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>; + defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>; // 4-element contiguous loads - defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>; - defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>; - defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>; - defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>; - defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>; - defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>; - defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>; + defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, 
LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>; // 8-element contiguous loads - defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>; - defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>; - defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>; - defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>; + defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; + + let Predicates = [HasBF16, HasSVE] in { + defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; + } // 16-element contiguous loads - defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>; + defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>; multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store, - Instruction RegImmInst> { + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous stores - defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>; - defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>; - defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>; - defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>; - defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>; - defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>; - defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>; + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>; + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>; + defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>; + defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>; + 
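The pred_load/pred_store multiclasses above gain reg+reg (am_sve_regreg_lsl0..3) and reg+imm (am_sve_indexed_s4) addressing for masked loads and stores, in addition to the plain-base fallback. Below is a hedged C sketch of a predicated loop whose LD1W/ST1W could use those addressing modes; the ACLE names from arm_sve.h are assumed and the actual codegen depends on how the address is formed.

#include <arm_sve.h>
#include <stddef.h>

/* Hedged sketch: each iteration loads and stores one vector's worth of
   floats under a while-predicate; the resulting masked LD1W/ST1W can be
   selected with [base, index, lsl #2] (reg+reg) or [base, #imm, mul vl]
   (reg+imm) addressing rather than a separately materialised address. */
void copy_f32(float *dst, const float *src, size_t n) {
  for (size_t i = 0; i < n; i += svcntw()) {
    svbool_t pg = svwhilelt_b32((uint64_t)i, (uint64_t)n);
    svfloat32_t v = svld1(pg, src + i);
    svst1(pg, dst + i, v);
  }
}
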
defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>; // 4-element contiguous stores - defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>; - defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>; - defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>; - defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>; - defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>; + defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>; + defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>; + defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>; // 8-element contiguous stores - defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>; - defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>; - defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>; + defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>; + defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + + let Predicates = [HasBF16, HasSVE] in { + defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + } // 16-element contiguous stores - defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>; + defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>; + + defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>; + defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>; + defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>; + defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>; + + defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>; + defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>; + defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>; + defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>; + + multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst, + Instruction PTrue> { + let AddedComplexity = 1 in { + def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + let AddedComplexity = 2 in { + def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Store (Ty ZPR:$val), GPR64:$base), + (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>; + defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>; + defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>; + defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, 
PTRUE_D>; + defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>; + defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>; + defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>; + defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>; + defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>; + defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>; + defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>; + defm : unpred_store< store, nxv4f32, ST1W_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>; + + multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst, + Instruction PTrue> { + let AddedComplexity = 1 in { + def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + let AddedComplexity = 2 in { + def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Ty (Load GPR64:$base)), + (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>; + defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; + defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; + defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; + defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; + defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; + defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; + defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>; + defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>; + defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>; + defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>; + defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>; + defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>; + defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>; + defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>; + defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>; + defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>; + defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>; + defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>; + defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>; + defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>; + defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>; + defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>; + + multiclass unpred_store_predicate<ValueType Ty, Instruction Store> { + def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), + (Store PPR:$val, GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(store (Ty PPR:$Val), GPR64:$base), + (Store PPR:$Val, GPR64:$base, (i64 0))>; + } + + defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, 
STR_PXI>; + defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>; + defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>; + defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>; + + multiclass unpred_load_predicate<ValueType Ty, Instruction Load> { + def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))), + (Load GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(Ty (load GPR64:$base)), + (Load GPR64:$base, (i64 0))>; + } + + defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>; + defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>; + defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>; + defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>; + + multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty, + SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous loads + defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>; + defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>; + defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>; + defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>; + defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>; + defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>; + defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>; + defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>; + + // 4-element contiguous loads + defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>; + defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>; + defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>; + defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>; + defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>; + defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>; + + // 8-element contiguous loads + defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; + defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; + defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>; + defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>; + + let Predicates = [HasBF16, HasSVE] in { + defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; + } + + // 16-element contiguous loads + defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; + + multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, 
ValueType PredTy, ValueType MemVT> { + // scalar + immediate (mul vl) + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>; + defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>; + defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>; + defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>; + defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>; + defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>; + defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>; + defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>; + + // 4-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>; + defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>; + defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>; + defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>; + defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>; + defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>; + + // 8-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>; + defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>; + defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>; + defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>; + + let Predicates = [HasBF16, HasSVE] in { + defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>; + } + + // 16-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>; - defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>; - defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>; - defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>; - defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>; + multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } - defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>; - defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>; - defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>; - defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>; + // Base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, XZR)>; + } + + // 2-element contiguous first faulting loads + defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>; + defm : ldff1<LDFF1SW_D, nxv2i64, 
AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>; + defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>; + defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>; + defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>; + + // 4-element contiguous first faulting loads + defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>; + defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>; + + // 8-element contiguous first faulting loads + defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; + defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>; + + let Predicates = [HasBF16, HasSVE] in { + defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; + } + + // 16-element contiguous first faulting loads + defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; + + multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty, + SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous store + defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>; + defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>; + defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>; + defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>; + + // 4-element contiguous store + defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>; + defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>; + defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>; + + // 8-element contiguous store + defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; + defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>; + + // 16-element contiguous store + defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; + + def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv8i16 (vector_insert 
(nxv8i16 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + + // Insert scalar into vector[0] + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)), + (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>; + + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)), + (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)), + (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)), + (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>; + + // Insert scalar into vector with scalar index + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_B ZPR:$vec, + (CMPEQ_PPzZZ_B (PTRUE_B 31), + (INDEX_II_B 0, 1), + (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)), + (CPY_ZPmR_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + GPR64:$src)>; + + // Insert FP scalar into vector with scalar index + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)), + (CPY_ZPmV_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)), + (CPY_ZPmV_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D $index)), + $src)>; + + // Extract element from vector with immediate index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)), + (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv4i32 
ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + + // Extract element from vector with scalar index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), + ZPR:$vec)>; + + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), + ZPR:$vec)>; +} + +let Predicates = [HasSVE, HasMatMulInt8] in { + defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; + defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>; + defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; + defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>; + defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; + defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; +} + +let Predicates = [HasSVE, HasMatMulFP32] in { + defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>; +} + +let Predicates = [HasSVE, HasMatMulFP64] in { + defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>; + defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>; + defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>; + defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>; + defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>; + defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>; + defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>; + defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>; + defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; + defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, 
"zip1", int_aarch64_sve_zip1q>; + defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; + defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; + defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; + defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; + defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; +} + +let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in { + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>; + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>; + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>; + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>; + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>; + def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>; } let Predicates = [HasSVE2] in { // SVE2 integer multiply-add (indexed) - defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">; - defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">; + defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; + defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; // SVE2 saturating multiply-add high (indexed) - defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">; - defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">; + defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>; + defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>; // SVE2 saturating multiply-add high (vectors, unpredicated) - defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">; - defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">; + defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>; + defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>; // SVE2 integer multiply (indexed) - defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">; + defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>; // SVE2 saturating multiply high (indexed) - defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">; - defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">; + defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>; + defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>; // SVE2 signed saturating doubling multiply high (unpredicated) - defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">; - defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">; + defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>; + defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>; // SVE2 integer multiply vectors (unpredicated) - defm MUL_ZZZ : sve2_int_mul<0b000, "mul">; - defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">; - defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">; - def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>; - + defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>; + defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>; + defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>; + defm PMUL_ZZZ : 
sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; + + // Add patterns for unpredicated version of smulh and umulh. + def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), + (SMULH_ZZZ_B $Op1, $Op2)>; + def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), + (SMULH_ZZZ_H $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), + (SMULH_ZZZ_S $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), + (SMULH_ZZZ_D $Op1, $Op2)>; + def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), + (UMULH_ZZZ_B $Op1, $Op2)>; + def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), + (UMULH_ZZZ_H $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), + (UMULH_ZZZ_S $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), + (UMULH_ZZZ_D $Op1, $Op2)>; // SVE2 complex integer dot product (indexed) - defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">; + defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; // SVE2 complex integer dot product - defm CDOT_ZZZ : sve2_cintx_dot<"cdot">; + defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>; // SVE2 complex integer multiply-add (indexed) - defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">; + defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>; // SVE2 complex saturating multiply-add (indexed) - defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">; + defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>; // SVE2 complex integer multiply-add - defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">; - defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">; + defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>; + defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>; // SVE2 integer multiply long (indexed) - defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">; - defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">; - defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">; - defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">; + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>; // SVE2 saturating multiply (indexed) - defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">; - defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">; + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>; // SVE2 integer multiply-add long (indexed) - defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">; - defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, 
"smlalt">; - defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">; - defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">; - defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">; - defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">; - defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">; - defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">; + defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>; + defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>; + defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>; + defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>; + defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>; + defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>; + defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>; + defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>; // SVE2 integer multiply-add long (vectors, unpredicated) - defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">; - defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">; - defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">; - defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">; - defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">; - defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">; - defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">; - defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">; + defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>; + defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>; + defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>; + defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>; + defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>; + defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>; + defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>; + defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>; // SVE2 saturating multiply-add long (indexed) - defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">; - defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">; - defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">; - defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">; + defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>; + defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>; + defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>; + defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>; // SVE2 saturating multiply-add long (vectors, unpredicated) - defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">; - defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">; - defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">; - defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, 
"sqdmlslt">; + defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>; + defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>; + defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>; + defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>; // SVE2 saturating multiply-add interleaved long - defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">; - defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">; + defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>; + defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>; // SVE2 integer halving add/subtract (predicated) - defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">; - defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">; - defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">; - defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">; - defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">; - defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">; - defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">; - defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">; + defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>; + defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>; + defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>; + defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>; + defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>; + defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>; + defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>; + defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>; // SVE2 integer pairwise add and accumulate long - defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">; - defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">; + defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>; + defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>; // SVE2 integer pairwise arithmetic - defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">; - defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">; - defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">; - defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">; - defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">; + defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>; + defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>; + defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>; + defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>; + defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>; // SVE2 integer unary operations (predicated) - defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">; - defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">; - defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">; - defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">; + defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>; + defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>; + defm SQABS_ZPmZ : 
sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>; + defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>; // SVE2 saturating add/subtract - defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">; - defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">; - defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">; - defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">; - defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">; - defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">; - defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">; - defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">; + defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>; + defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>; + defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>; + defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>; + defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>; + defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>; + defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>; + defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>; // SVE2 saturating/rounding bitwise shift left (predicated) - defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">; - defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">; - defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">; - defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">; - defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">; - defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">; - defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">; - defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">; - defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">; - defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">; - defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; - defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>; + defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>; + defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>; + defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>; + defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>; + defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>; + defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>; + defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>; + defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>; + defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>; + defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>; + defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>; + + let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in { + defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>; + defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>; + defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>; + defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>; + defm SQSHLU_ZPZI : 
sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>; + } // SVE2 predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; + defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; // SVE2 integer add/subtract long - defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; - defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; - defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">; - defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">; - defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">; - defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">; - defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">; - defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">; - defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">; - defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">; - defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">; - defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">; + defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>; + defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>; + defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>; + defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>; + defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>; + defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>; + defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>; + defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>; + defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>; + defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>; + defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>; + defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>; // SVE2 integer add/subtract wide - defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">; - defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">; - defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">; - defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">; - defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">; - defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">; - defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">; - defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">; + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>; + 
defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>; // SVE2 integer multiply long - defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">; - defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">; - defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">; - defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">; - defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">; - defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">; - defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">; - defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; + defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; + defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>; + defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>; + defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>; + defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>; + defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>; + defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>; // SVE2 bitwise shift and insert - defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">; - defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">; + defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">; - defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">; - defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">; - defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>; // SVE2 complex integer add - defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; - defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">; + defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>; + defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; // SVE2 integer absolute difference and accumulate - defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">; - defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">; + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; // SVE2 integer absolute difference and accumulate long - defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">; - defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">; - defm UABALB_ZZZ : 
sve2_int_absdiff_accum_long<0b10, "uabalb">; - defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">; + defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; + defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>; + defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>; + defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>; // SVE2 integer add/subtract long with carry - defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">; - defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">; - defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; - defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; + defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>; + defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>; + defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>; + defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>; // SVE2 bitwise shift right narrow (bottom) defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>; @@ -1489,29 +2426,29 @@ let Predicates = [HasSVE2] in { defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; // SVE2 character match - defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; - defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">; + defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>; + defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; // SVE2 bitwise exclusive-or interleaved - defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">; - defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">; + defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; + defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; // SVE2 bitwise shift left long - defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">; - defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">; - defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">; - defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">; + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>; // SVE2 integer add/subtract interleaved long - defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">; - defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">; - defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">; + defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; + defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; + defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; // SVE2 histogram generation (segment) - def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">; + def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>; // SVE2 histogram generation (vector) - defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + defm HISTCNT_ZPzZZ : 
sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; // SVE2 floating-point base 2 logarithm as integer defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; @@ -1542,50 +2479,57 @@ let Predicates = [HasSVE2] in { defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>; // SVE2 bitwise ternary operations - defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">; - defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">; - def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">; - def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">; - def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; - def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; + defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; + defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; + defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; + defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; + defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; // SVE2 bitwise xor and rotate right by immediate - defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; + defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>; // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, 
nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) - defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; - defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; + defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; + defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>; + def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)), + (nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1), + nxv8i16:$Op3))>; + } // SVE2 integer compare scalar count and limit defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>; @@ -1599,43 +2543,41 @@ let Predicates = [HasSVE2] in { defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; // SVE2 pointer conflict compare - defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; - defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; } let Predicates = [HasSVE2AES] in { // SVE2 crypto destructive binary operations - def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>; - def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>; + defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>; + defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>; // SVE2 crypto unary operations - def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">; - def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">; + defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>; + defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>; // PMULLB and PMULLT instructions which operate with 64-bit source and // 128-bit destination elements are enabled with crypto extensions, similar // to NEON PMULL2 instruction. 
- def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb", - ZPR128, ZPR64, ZPR64>; - def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt", - ZPR128, ZPR64, ZPR64>; + defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>; } let Predicates = [HasSVE2SM4] in { // SVE2 crypto constructive binary operations - def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>; + defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>; // SVE2 crypto destructive binary operations - def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>; + defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>; } let Predicates = [HasSVE2SHA3] in { // SVE2 crypto constructive binary operations - def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>; + defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } let Predicates = [HasSVE2BitPerm] in { // SVE2 bitwise permute - defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">; - defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">; - defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">; + defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; + defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; + defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index a6df0f3f083cb..c5ff1fcb274b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -26,7 +26,8 @@ def CortexA53Model : SchedMachineModel { // v 1.0 Spreadsheet let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 9f566d1c7079b..7c40da05c3056 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -31,7 +31,8 @@ def CortexA57Model : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -501,7 +502,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // Q form - v16i8, v8i16, v4i32, v2i64 // ASIMD bitwise insert, Q-form -def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 798ecb7508c08..8abcb804d5c71 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -18,7 +18,8 @@ def CycloneModel : SchedMachineModel { let MispredictPenalty = 16; // 14-19 cycles are typical. 
let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -494,7 +495,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; // WriteV includes: // SHLL,SSHLL,USHLL // SLI,SRI -// BIF,BIT,BSL +// BIF,BIT,BSL,BSP // EXT // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN // XTN2 diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d1734c455b2b4..8413a06ed3916 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -24,7 +24,8 @@ def ExynosM3Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -660,7 +661,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index d2284f9fa0b50..34e8beb423ce9 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -24,7 +24,8 @@ def ExynosM4Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -803,7 +804,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index df7402591e7b9..403aac80e47bf 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -24,7 +24,8 @@ def ExynosM5Model : SchedMachineModel { let MispredictPenalty = 15; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -841,7 +842,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 92d03963de57f..a17ab36d7f9e0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -23,8 +23,8 @@ def FalkorModel : SchedMachineModel { let MispredictPenalty = 11; // Minimum branch misprediction penalty. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 697a0f69c58cb..f2cd83caffa2b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$") def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 0e1a24103121e..ba14bf1f50de1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -27,8 +27,8 @@ def KryoModel : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 4c60992e6351a..bc5ad0f8beced 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln : let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index 3b6aecf5c0353..9c50f97085830 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -25,8 +25,8 @@ def ThunderXT8XModel : SchedMachineModel { let PostRAScheduler = 1; // Use PostRA scheduler. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index e2a293c068774..95c29dd2a567f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -25,8 +25,8 @@ def ThunderX2T99Model : SchedMachineModel { let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } @@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td new file mode 100644 index 0000000000000..00838cc4b9bd4 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -0,0 +1,1997 @@ +//=- AArch64SchedThunderX3T110.td - Marvell ThunderX3 T110 ---*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Marvell ThunderX3T110 +// family of processors. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pipeline Description. + +def ThunderX3T110Model : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. 
+ let MicroOpBufferSize = 70; // 70 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 128; // FIXME: might be much bigger in TX3. + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); + // FIXME: Remove when all errors have been fixed. + let FullInstRWOverlapCheck = 0; +} + +let SchedModel = ThunderX3T110Model in { + +// Issue ports. + +// Port 0: ALU. +def THX3T110P0 : ProcResource<1>; + +// Port 1: ALU. +def THX3T110P1 : ProcResource<1>; + +// Port 2: ALU/Branch. +def THX3T110P2 : ProcResource<1>; + +// Port 3: ALU/Branch. +def THX3T110P3 : ProcResource<1>; + +// Port 4: Load/Store. +def THX3T110P4 : ProcResource<1>; + +// Port 5: Load/store. +def THX3T110P5 : ProcResource<1>; + +// Port 6: FP/Neon/SIMD/Crypto. +def THX3T110P6FP0 : ProcResource<1>; + +// Port 7: FP/Neon/SIMD/Crypto. +def THX3T110P7FP1 : ProcResource<1>; + +// Port 8: FP/Neon/SIMD/Crypto. +def THX3T110P8FP2 : ProcResource<1>; + +// Port 9: FP/Neon/SIMD/Crypto. +def THX3T110P9FP3 : ProcResource<1>; + +// Port 10: Store Data Unit. +def THX3T110SD0 : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes. + +// Integer divide/mulhi micro-ops only on port I1. +def THX3T110I1 : ProcResGroup<[THX3T110P1]>; + +// Branch micro-ops on ports I2/I3. +def THX3T110I23 : ProcResGroup<[THX3T110P2, THX3T110P3]>; + +// Branch micro-ops on ports I1/I2/I3. +def THX3T110I123 : ProcResGroup<[THX3T110P1, THX3T110P2, THX3T110P3]>; + +// Integer micro-ops on ports I0/I1/I2. +def THX3T110I012 : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2]>; + +// Integer micro-ops on ports I0/I1/I2/I3. +def THX3T110I0123 : ProcResGroup<[THX3T110P0, THX3T110P1, + THX3T110P2, THX3T110P3]>; + +// FP micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110FP0123 : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// FP micro-ops on ports FP2/FP3. +def THX3T110FP23 : ProcResGroup<[THX3T110P8FP2, THX3T110P9FP3]>; + +// ASIMD micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110SIMD : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// Store data micro-ops only on port 10. +def THX3T110SD : ProcResGroup<[THX3T110SD0]>; + +// Load/store micro-ops on ports P4/P5. +def THX3T110LS : ProcResGroup<[THX3T110P4, THX3T110P5]>; + +// 70 entry unified scheduler. +def THX3T110ANY: ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2, + THX3T110P3, THX3T110P4, THX3T110P5, + THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]> { + let BufferSize = 70; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: THX3T110Write_<NumCycles>Cyc_<Resources>. + +// 3 cycles on I1. +def THX3T110Write_3Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I1. +def THX3T110Write_4Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on I1. +def THX3T110Write_5Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 7 cycles on I1. +def THX3T110Write_7Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 23 cycles on I1. 
+def THX3T110Write_23Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. +def THX3T110Write_39Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} + +// 1 cycle on I2/I3 +def THX3T110Write_1Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I2/I3 +def THX3T110Write_8Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I1/I2/I3 +def THX3T110Write_1Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I1/I2/I3 +def THX3T110Write_8Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I0/I1/I2/I3. +def THX3T110Write_1Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0/I1/I2/I3. +def THX3T110Write_2Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 3 cycles on I0/I1/I2/I3. +def THX3T110Write_3Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I0/I1/I2/I3. +def THX3T110Write_4Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on I0/I1/I2/I3. +def THX3T110Write_5Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on I0/I1/I2/I3. +def THX3T110Write_6Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 8 cycles on I0/I1/I2/I3. +def THX3T110Write_8Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 4; +} + +// 13 cycles on I0/I1/I2/I3. +def THX3T110Write_13Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 13; + let NumMicroOps = 3; +} + +// 23 cycles on I0/I1/I2/I3. +def THX3T110Write_23Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 23; + let NumMicroOps = 3; +} + +// 39 cycles on I0/I1/I2/I3. +def THX3T110Write_39Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 39; + let NumMicroOps = 3; +} + +// 4 cycles on F2/F3. +def THX3T110Write_4Cyc_F23 : SchedWriteRes<[THX3T110FP23]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on F0/F1/F2/F3. +def THX3T110Write_6Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on F0/F1/F2/F3. +def THX3T110Write_7Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on F0/F1/F2/F3. +def THX3T110Write_8Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0/F1/F2/F3. +def THX3T110Write_10Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 3; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [8]; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [11]; +} + +// 1 cycle on LS0/LS1. 
+def THX3T110Write_1Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +// 2 cycles on LS0/LS1. +def THX3T110Write_2Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1. +def THX3T110Write_4Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +// 5 cycles on LS0/LS1. +def THX3T110Write_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1. +def THX3T110Write_6Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 + 5 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 5 cycles. +// Load vector pair, immed offset, Q-form [LDP/LDNP]. +def THX3T110Write_4_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 5]; +} + +// 4 + 8 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 8 cycles. +// Load vector pair, immed offset, S/D-form [LDP/LDNP]. +def THX3T110Write_4_8Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 8]; +} + +// 11 cycles on LS0/LS1 and I1. +def THX3T110Write_11Cyc_LS01_I1 : + SchedWriteRes<[THX3T110LS, THX3T110I1]> { + let Latency = 11; + let NumMicroOps = 4; +} + +// 1 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 1 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I012 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 1 cycle on LS0/LS1 and SD. +def THX3T110Write_1Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1 and SD. +def THX3T110Write_2Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1 and SD. +def THX3T110Write_4Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and SD. 
+def THX3T110Write_5Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and SD. +def THX3T110Write_6Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycle on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_2Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_1Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 5 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_5Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_6Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_7Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_8Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_8Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 12 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_12Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 12; + let NumMicroOps = 4; +} + +// 16 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_16Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 16; + let NumMicroOps = 5; +} + +// 24 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_24Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 24; + let NumMicroOps = 10; +} + +// 32 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_32Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 32; + let NumMicroOps = 14; +} + +// 3 cycles on F0/F1/F2/F3. +def THX3T110Write_3Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on F0/F1/F2/F3. +def THX3T110Write_4Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 10 cycles on F0/F1/F2/F3. 
+def THX3T110Write_10Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 4; +} + +// 15 cycles on F0/F1/F2/F3. +def THX3T110Write_15Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 15; + let NumMicroOps = 7; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; +} + +// 18 cycles on F0/F1/F2/F3. +def THX3T110Write_18Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 18; + let NumMicroOps = 3; +} + +// 19 cycles on F0/F1/F2/F3. +def THX3T110Write_19Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 19; + let NumMicroOps = 4; +} + +// 20 cycles on F0/F1/F2/F3. +def THX3T110Write_20Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 20; + let NumMicroOps = 4; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 4; +} + +// 3 cycles on F2/F3 and 4 cycles on F0/F1/F2/F3. +def THX3T110Write_3_4Cyc_F23_F0123 : + SchedWriteRes<[THX3T110FP23, THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [3, 4]; +} + + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [THX3T110I23]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [THX3T110I23]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { + let Latency = 4; + let NumMicroOps = 2; +} + +//--- +// Branch +//--- +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs Bcc)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs RET)>; +def : InstRW<[THX3T110Write_1Cyc_I23], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [THX3T110I0123]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [THX3T110I0123]> { + let Latency = 2; + let ResourceCycles = 
[2]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : WriteRes<WriteIEReg, [THX3T110I0123]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +// Move immed +def : WriteRes<WriteImm, [THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes<WriteIS, [THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23/13-39. +def : WriteRes<WriteID32, [THX3T110I1]> { + let Latency = 39; + let ResourceCycles = [39]; + let NumMicroOps = 4; +} + +// Divide, X-form +def : WriteRes<WriteID64, [THX3T110I1]> { + let Latency = 23; + let ResourceCycles = [23]; + let NumMicroOps = 4; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX3T110Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX3T110Write_5Cyc_I0123], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX3T110Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX3T110Write_1Cyc_I0123], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "^BFM")>; +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AES[DE]")>; +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AESI?MC")>; +def : 
InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^PMULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX3T110Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. +def : WriteRes<WriteAdr, [THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def THX3T110WriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [THX3T110Write_4Cyc_LS01_I0123_I0123]>, + SchedVar<NoSchedPred, [THX3T110Write_4Cyc_LS01_I0123]>]>; +def : SchedAlias<WriteLDIdx, THX3T110WriteLDIdx>; + +def THX3T110ReadAdrBase : SchedReadVariant<[ + SchedVar<ScaledIdxPred, [ReadDefault]>, + SchedVar<NoSchedPred, [ReadDefault]>]>; +def : SchedAlias<ReadAdrBase, THX3T110ReadAdrBase>; + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRQui)>; +def : 
InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre, + LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost, + LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost, + LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, + LDRQpost, LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre, LDPXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost, + LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, 
ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSWi)>; + +// Load exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXP(W|X)$")>; + +//--- +// Prefetch +//--- +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes<WriteST, [THX3T110LS, THX3T110SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes<WriteSTIdx, [THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes<WriteSTP, [THX3T110LS, THX3T110SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP. 
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURSi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURWi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRXi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRBui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRDui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRHui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRQui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRXui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRWui)>; + +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRBui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRDui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRHui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRQui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRXui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRWui)>; + +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRBui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRDui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRHui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRQui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRXui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, 
THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + +// Store exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instrs STNPWi, STNPXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXR(B|H|W|X)$")>; + +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP arithmetic +def : InstRW<[THX3T110Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes<WriteFCmp, [THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + 
+// FP Mul, Div, Sqrt +def : WriteRes<WriteFDiv, [THX3T110FP0123]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THX3T110XWriteFDiv : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX3T110XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX3T110XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THX3T110Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>; + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[THX3T110XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX3T110XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX3T110Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMul : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMulAcc : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX3T110XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX3T110XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; + +// FP round to integral +def : InstRW<[THX3T110Write_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [THX3T110FP0123]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [THX3T110FP0123]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// 
ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B + +// ASIMD logical (MVN (alias for NOT), ORN, ORR) +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD arith, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SQSHLU")>; +// ASIMD shift by 
register, basic, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith,pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// 
NOTE: Handled by WriteV. + +// ASIMD FP convert, long and narrow +def : InstRW<[THX3T110Write_5Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv4f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>; +def : InstRW<[THX3T110Write_23Cyc_F0123], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP negate +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FNEGv")>; + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>; + +// ASIMD extract +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^EXTv")>; + +// ASIMD extract narrow +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^XTNv")>; + +// ASIMD extract narrow, saturating +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; + +// ASIMD insert, element to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transfer, element to gen reg +def : 
InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD move, integer immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^MOVIv")>; + +// ASIMD move, FP immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FMOVv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[THX3T110Write_15Cyc_F0123], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[THX3T110Write_20Cyc_F0123], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, element to word or word +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(S|U)MOVv.*")>; + +// ASIMD transfer gen reg to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1i(8|16|32|64)$")>; +def : 
InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex 
"^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + 
(instregex "^ST4i(8|16|32|64)_POST$")>; + +// V8.1a Atomics (LSE) +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, LDEORX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, + LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, + LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, + LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, + LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, + LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, + LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, + LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, + LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, + LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, + LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX, + LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, + LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs SWPB, SWPH, SWPW, SWPX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs SWPALB, SWPALH, SWPALW, 
SWPALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; + +// V8.3a PAC +def : InstRW<[THX3T110Write_11Cyc_LS01_I1], (instregex "^LDRAA", "^LDRAB")>; +def : InstRW<[THX3T110Write_8Cyc_I123], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, + BRAA, BRAAZ, BRAB, BRABZ)>; +def : InstRW<[THX3T110Write_8Cyc_I123], (instrs RETAA, RETAB)>; + +} // SchedModel = ThunderX3T110Model diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index ba61ed726e840..8f814d185e859 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); @@ -117,7 +117,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand( - DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16); + DstPtrInfo, MachineMemOperand::MOStore, ObjSize, Align(16)); bool UseSetTagRangeLoop = kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold; @@ -125,21 +125,18 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - if (ObjSize % 32 != 0) { - SDNode *St1 = DAG.getMachineNode( - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, - {MVT::i64, MVT::Other}, - {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); - DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand}); - ObjSize -= 16; - Addr = SDValue(St1, 0); - Chain = SDValue(St1, 1); - } - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; - SDNode *St = DAG.getMachineNode( - ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); + + unsigned Opcode; + if (Addr.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Addr)->getIndex(); + Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + Opcode = ZeroData ? AArch64::STZGloop : AArch64::STGloop; + } else { + Opcode = ZeroData ? 
AArch64::STZGloop_wback : AArch64::STGloop_wback; + } + SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops); DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand}); return SDValue(St, 2); diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index d0967fb973cc3..d94fd8471b7b9 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -21,7 +21,8 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, + bool isVolatile, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index f95b5dc5246e9..6fa1c744f77e2 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -16,6 +16,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/TypeSize.h" +#include <cassert> namespace llvm { diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 975502818fcd2..61f27cbc3b29d 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -19,10 +19,13 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -44,6 +47,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -61,6 +65,11 @@ static cl::opt<bool> ClMergeInit( "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("merge stack variable initializers with tagging when possible")); +static cl::opt<bool> + ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden, + cl::init(true), cl::ZeroOrMore, + cl::desc("Use Stack Safety analysis results")); + static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit", cl::init(40), cl::Hidden); @@ -256,8 +265,9 @@ public: Type *EltTy = VecTy->getElementType(); if (EltTy->isPointerTy()) { uint32_t EltSize = DL->getTypeSizeInBits(EltTy); - Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize), - VecTy->getNumElements()); + auto *NewTy = FixedVectorType::get( + IntegerType::get(Ctx, EltSize), + cast<FixedVectorType>(VecTy)->getNumElements()); V = IRB.CreatePointerCast(V, NewTy); } } @@ -275,15 +285,17 @@ class AArch64StackTagging : public FunctionPass { int Tag; // -1 for non-tagged allocations }; - bool MergeInit; + const bool MergeInit; + const bool UseStackSafety; public: static char ID; // Pass ID, replacement for typeid 
- AArch64StackTagging(bool MergeInit = true) + AArch64StackTagging(bool IsOptNone = false) : FunctionPass(ID), - MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit - : MergeInit) { + MergeInit(ClMergeInit.getNumOccurrences() ? ClMergeInit : !IsOptNone), + UseStackSafety(ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety + : !IsOptNone) { initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); } @@ -305,13 +317,16 @@ public: StringRef getPassName() const override { return "AArch64 Stack Tagging"; } private: - Function *F; - Function *SetTagFunc; - const DataLayout *DL; - AAResults *AA; + Function *F = nullptr; + Function *SetTagFunc = nullptr; + const DataLayout *DL = nullptr; + AAResults *AA = nullptr; + const StackSafetyGlobalInfo *SSI = nullptr; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (UseStackSafety) + AU.addRequired<StackSafetyGlobalInfoWrapperPass>(); if (MergeInit) AU.addRequired<AAResultsWrapperPass>(); } @@ -323,11 +338,13 @@ char AArch64StackTagging::ID = 0; INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass) INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) -FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) { - return new AArch64StackTagging(MergeInit); +FunctionPass *llvm::createAArch64StackTaggingPass(bool IsOptNone) { + return new AArch64StackTagging(IsOptNone); } Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, @@ -400,7 +417,9 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { // dynamic alloca instrumentation for them as well. 
!AI.isUsedWithInAlloca() && // swifterror allocas are register promoted by ISel - !AI.isSwiftError(); + !AI.isSwiftError() && + // safe allocas are not interesting + !(SSI && SSI->isSafe(AI)); return IsInteresting; } @@ -482,7 +501,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { auto *NewAI = new AllocaInst( TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); NewAI->takeName(Info.AI); - NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment())); + NewAI->setAlignment(Info.AI->getAlign()); NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); @@ -516,6 +535,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) return false; + if (UseStackSafety) + SSI = &getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult(); F = &Fn; DL = &Fn.getParent()->getDataLayout(); if (MergeInit) diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 5deb601822b8c..a94856ef4fba3 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -149,7 +149,9 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { continue; const MachineOperand *BaseOp; int64_t Offset; - if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && + bool OffsetIsScalable; + if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, + TRI) && BaseOp->isReg()) { Register BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 3636d8d2b628c..029535cb98b57 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -13,12 +13,12 @@ #include "AArch64Subtarget.h" #include "AArch64.h" -#include "AArch64CallLowering.h" #include "AArch64InstrInfo.h" -#include "AArch64LegalizerInfo.h" #include "AArch64PBQPRegAlloc.h" -#include "AArch64RegisterBankInfo.h" #include "AArch64TargetMachine.h" +#include "GISel/AArch64CallLowering.h" +#include "GISel/AArch64LegalizerInfo.h" +#include "GISel/AArch64RegisterBankInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -47,6 +47,18 @@ static cl::opt<bool> cl::desc("Call nonlazybind functions via direct GOT load"), cl::init(false), cl::Hidden); +static cl::opt<unsigned> SVEVectorBitsMax( + "aarch64-sve-vector-bits-max", + cl::desc("Assume SVE vector registers are at most this big, " + "with zero meaning no maximum size is assumed."), + cl::init(0), cl::Hidden); + +static cl::opt<unsigned> SVEVectorBitsMin( + "aarch64-sve-vector-bits-min", + cl::desc("Assume SVE vector registers are at least this big, " + "with zero meaning no minimum size is assumed."), + cl::init(0), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, StringRef CPUString) { @@ -68,6 +80,9 @@ void AArch64Subtarget::initializeProperties() { switch (ARMProcFamily) { case Others: break; + case Carmel: + CacheLineSize = 64; + break; case CortexA35: break; case CortexA53: @@ -86,8 +101,16 @@ void AArch64Subtarget::initializeProperties() { case CortexA73: case CortexA75: case CortexA76: + case CortexA77: + case CortexA78: + case CortexX1: PrefFunctionLogAlignment = 4; break; + case 
A64FX: + CacheLineSize = 256; + PrefFunctionLogAlignment = 5; + PrefLoopLogAlignment = 5; + break; case AppleA7: case AppleA10: case AppleA11: @@ -160,6 +183,17 @@ void AArch64Subtarget::initializeProperties() { PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; break; + case ThunderX3T110: + CacheLineSize = 64; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 2; + MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; + break; } } @@ -177,6 +211,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, ReserveXRegister.set(18); CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); + InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); Legalizer.reset(new AArch64LegalizerInfo(*this)); auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); @@ -194,6 +229,10 @@ const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } +const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const { + return InlineAsmLoweringInfo.get(); +} + InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -305,3 +344,25 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { if (!MFI.isMaxCallFrameSizeComputed()) MFI.computeMaxCallFrameSize(MF); } + +unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const { + assert(HasSVE && "Tried to get SVE vector length without SVE support!"); + assert(SVEVectorBitsMax % 128 == 0 && + "SVE requires vector length in multiples of 128!"); + assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && + "Minimum SVE vector size should not be larger than its maximum!"); + if (SVEVectorBitsMax == 0) + return 0; + return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; +} + +unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const { + assert(HasSVE && "Tried to get SVE vector length without SVE support!"); + assert(SVEVectorBitsMin % 128 == 0 && + "SVE requires vector length in multiples of 128!"); + assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && + "Minimum SVE vector size should not be larger than its maximum!"); + if (SVEVectorBitsMax == 0) + return (SVEVectorBitsMin / 128) * 128; + return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; +} diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 79c2c161d3cb2..b111f00169488 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -38,11 +39,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { public: enum ARMProcFamilyEnum : uint8_t { Others, + A64FX, AppleA7, AppleA10, AppleA11, AppleA12, AppleA13, + Carmel, CortexA35, CortexA53, CortexA55, @@ -52,6 +55,9 @@ public: CortexA73, CortexA75, CortexA76, + CortexA77, + CortexA78, + CortexX1, ExynosM3, Falkor, Kryo, @@ -63,7 +69,8 @@ public: ThunderXT81, ThunderXT83, ThunderXT88, - TSV110 + TSV110, + 
ThunderX3T110 }; protected: @@ -75,6 +82,7 @@ protected: bool HasV8_3aOps = false; bool HasV8_4aOps = false; bool HasV8_5aOps = false; + bool HasV8_6aOps = false; bool HasFPARMv8 = false; bool HasNEON = false; @@ -99,6 +107,10 @@ protected: bool HasPAN_RWV = false; bool HasCCPP = false; + // SVE extensions + bool HasSVE = false; + bool UseExperimentalZeroingPseudos = false; + // Armv8.2 Crypto extensions bool HasSM4 = false; bool HasSHA3 = false; @@ -125,8 +137,6 @@ protected: bool HasRCPC_IMMO = false; bool HasLSLFast = false; - bool HasSVE = false; - bool HasSVE2 = false; bool HasRCPC = false; bool HasAggressiveFMA = false; @@ -143,7 +153,17 @@ protected: bool HasMTE = false; bool HasTME = false; + // Armv8.6-A Extensions + bool HasBF16 = false; + bool HasMatMulInt8 = false; + bool HasMatMulFP32 = false; + bool HasMatMulFP64 = false; + bool HasAMVS = false; + bool HasFineGrainedTraps = false; + bool HasEnhancedCounterVirtualization = false; + // Arm SVE2 extensions + bool HasSVE2 = false; bool HasSVE2AES = false; bool HasSVE2SM4 = false; bool HasSVE2SHA3 = false; @@ -196,6 +216,8 @@ protected: bool UseEL2ForTP = false; bool UseEL3ForTP = false; bool AllowTaggedGlobals = false; + bool HardenSlsRetBr = false; + bool HardenSlsBlr = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -225,6 +247,7 @@ protected: /// GlobalISel related APIs. std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; std::unique_ptr<InstructionSelector> InstSelector; std::unique_ptr<LegalizerInfo> Legalizer; std::unique_ptr<RegisterBankInfo> RegBankInfo; @@ -260,6 +283,7 @@ public: return &getInstrInfo()->getRegisterInfo(); } const CallLowering *getCallLowering() const override; + const InlineAsmLowering *getInlineAsmLowering() const override; InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -347,6 +371,9 @@ public: hasFuseCCSelect() || hasFuseLiterals(); } + bool hardenSlsRetBr() const { return HardenSlsRetBr; } + bool hardenSlsBlr() const { return HardenSlsBlr; } + bool useEL1ForTP() const { return UseEL1ForTP; } bool useEL2ForTP() const { return UseEL2ForTP; } bool useEL3ForTP() const { return UseEL3ForTP; } @@ -359,7 +386,12 @@ public: } unsigned getCacheLineSize() const override { return CacheLineSize; } unsigned getPrefetchDistance() const override { return PrefetchDistance; } - unsigned getMinPrefetchStride() const override { return MinPrefetchStride; } + unsigned getMinPrefetchStride(unsigned NumMemAccesses, + unsigned NumStridedMemAccesses, + unsigned NumPrefetches, + bool HasCall) const override { + return MinPrefetchStride; + } unsigned getMaxPrefetchIterationsAhead() const override { return MaxPrefetchIterationsAhead; } @@ -372,6 +404,10 @@ public: unsigned getWideningBaseCost() const { return WideningBaseCost; } + bool useExperimentalZeroingPseudos() const { + return UseExperimentalZeroingPseudos; + } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. 
bool supportsAddressTopByteIgnored() const; @@ -401,6 +437,16 @@ public: bool hasSVE2SM4() const { return HasSVE2SM4; } bool hasSVE2SHA3() const { return HasSVE2SHA3; } bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } + bool hasMatMulInt8() const { return HasMatMulInt8; } + bool hasMatMulFP32() const { return HasMatMulFP32; } + bool hasMatMulFP64() const { return HasMatMulFP64; } + + // Armv8.6-A Extensions + bool hasBF16() const { return HasBF16; } + bool hasFineGrainedTraps() const { return HasFineGrainedTraps; } + bool hasEnhancedCounterVirtualization() const { + return HasEnhancedCounterVirtualization; + } bool isLittleEndian() const { return IsLittle; } @@ -438,6 +484,7 @@ public: bool hasDIT() const { return HasDIT; } bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } + bool hasAMVS() const { return HasAMVS; } bool hasSEL2() const { return HasSEL2; } bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } @@ -497,6 +544,12 @@ public: } void mirFileLoaded(MachineFunction &MF) const override; + + // Return the known range for the bit length of SVE data registers. A value + // of 0 means nothing is known about that particular limit beyong what's + // implied by the architecture. + unsigned getMaxSVEVectorSizeInBits() const; + unsigned getMinSVEVectorSizeInBits() const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 6e82d326e5194..ceceabc6ff4ed 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,18 +18,18 @@ include "llvm/TableGen/SearchableTable.td" //===----------------------------------------------------------------------===// def HasCCPP : Predicate<"Subtarget->hasCCPP()">, - AssemblerPredicate<"FeatureCCPP", "ccpp">; + AssemblerPredicate<(all_of FeatureCCPP), "ccpp">; def HasPAN : Predicate<"Subtarget->hasPAN()">, - AssemblerPredicate<"FeaturePAN", + AssemblerPredicate<(all_of FeaturePAN), "ARM v8.1 Privileged Access-Never extension">; def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, - AssemblerPredicate<"FeaturePsUAO", + AssemblerPredicate<(all_of FeaturePsUAO), "ARM v8.2 UAO PState extension (psuao)">; def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, - AssemblerPredicate<"FeaturePAN_RWV", + AssemblerPredicate<(all_of FeaturePAN_RWV), "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; //===----------------------------------------------------------------------===// @@ -338,7 +338,7 @@ def : PState<"PAN", 0b00100>; // v8.2a "User Access Override" extension-specific PStates let Requires = [{ {AArch64::FeaturePsUAO} }] in def : PState<"UAO", 0b00011>; -// v8.4a timining insensitivity of data processing instructions +// v8.4a timing insensitivity of data processing instructions let Requires = [{ {AArch64::FeatureDIT} }] in def : PState<"DIT", 0b11010>; // v8.5a Spectre Mitigation @@ -844,7 +844,7 @@ def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; -def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : ROSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 
0b0011, 0b010>; @@ -1167,7 +1167,6 @@ def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; -def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; @@ -1185,9 +1184,8 @@ def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>; def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>; def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>; def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>; -def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; +def : ROSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>; -def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>; def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>; def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>; @@ -1260,7 +1258,7 @@ let Requires = [{ {AArch64::FeatureSPE} }] in { def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; -def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : ROSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; @@ -1269,7 +1267,7 @@ def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; -def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +def : ROSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; } // v8.2a "RAS extension" registers @@ -1333,7 +1331,6 @@ def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; let Requires = [{ {AArch64::FeatureRASv8_4} }] in { def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>; def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>; -def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>; def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>; def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>; def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>; @@ -1360,7 +1357,7 @@ def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>; def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; } //FeatureMPAM -// v8.4a Activitiy Monitor registers +// v8.4a Activity Monitor registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureAM} }] in { def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>; @@ -1426,7 +1423,7 @@ def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>; } //FeatureTRACEV8_4 -// v8.4a Timining 
insensitivity of data processing instructions +// v8.4a Timing insensitivity of data processing instructions // DIT: Data Independent Timing instructions // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureDIT} }] in { @@ -1490,6 +1487,41 @@ def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; } // FeatureTRBE + +// v8.6a Activity Monitors Virtualization Support +let Requires = [{ {AArch64::FeatureAMVS} }] in { +foreach n = 0-15 in { + foreach x = 0-1 in { + def : RWSysReg<"AMEVCNTVOFF"#x#n#"_EL2", + 0b11, 0b100, 0b1101, 0b1000, 0b000>{ + let Encoding{4} = x; + let Encoding{3-0} = n; + } + } +} +} + +// v8.6a Fine Grained Virtualization Traps +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureFineGrainedTraps} }] in { +def : RWSysReg<"HFGRTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b100>; +def : RWSysReg<"HFGWTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b101>; +def : RWSysReg<"HFGITR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b110>; +def : RWSysReg<"HDFGRTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b100>; +def : RWSysReg<"HDFGWTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b101>; +} + +// v8.6a Enhanced Counter Virtualization +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureEnhancedCounterVirtualization} }] in { +def : RWSysReg<"CNTSCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b100>; +def : RWSysReg<"CNTISCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTPOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b110>; +def : RWSysReg<"CNTVFRQ_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b111>; +def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>; +} + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcAppleA7} }] in diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 115a7da8a6d90..a63b9a97ada55 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" @@ -26,6 +27,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -146,6 +148,11 @@ static cl::opt<int> EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); +static cl::opt<bool> EnableSVEIntrinsicOpts( + "aarch64-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); + static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -176,13 +183,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeAArch64LoadStoreOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); + initializeAArch64PostLegalizerCombinerPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); initializeFalkorHWPFFixPass(*PR); initializeFalkorMarkStridedAccessesLegacyPass(*PR); 
initializeLDTLSCleanupPass(*PR); + initializeSVEIntrinsicOptsPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); + initializeAArch64SLSHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); initializeAArch64StackTaggingPreRAPass(*PR); } @@ -236,12 +246,8 @@ getEffectiveAArch64CodeModel(const Triple &TT, Optional<CodeModel::Model> CM, if (CM) { if (*CM != CodeModel::Small && *CM != CodeModel::Tiny && *CM != CodeModel::Large) { - if (!TT.isOSFuchsia()) - report_fatal_error( - "Only small, tiny and large code models are allowed on AArch64"); - else if (*CM != CodeModel::Kernel) - report_fatal_error("Only small, tiny, kernel, and large code models " - "are allowed on AArch64"); + report_fatal_error( + "Only small, tiny and large code models are allowed on AArch64"); } else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF()) report_fatal_error("tiny code model is only supported on ELF"); return *CM; @@ -313,6 +319,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports default outlining behaviour. setSupportsDefaultOutlining(true); + + // AArch64 supports the debug entry values. + setSupportsDebugEntryValues(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; @@ -403,6 +412,7 @@ public: bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; @@ -435,6 +445,10 @@ void AArch64PassConfig::addIRPasses() { // ourselves. addPass(createAtomicExpandPass()); + // Expand any SVE vector library calls that we can't code generate directly. + if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive) + addPass(createSVEIntrinsicOptsPass()); + // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. @@ -454,6 +468,9 @@ void AArch64PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); + addPass(createAArch64StackTaggingPass( + /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); @@ -473,9 +490,6 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } - addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() != - CodeGenOpt::None)); - // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); @@ -541,6 +555,14 @@ bool AArch64PassConfig::addLegalizeMachineIR() { return false; } +void AArch64PassConfig::addPreRegBankSelect() { + // For now we don't add this to the pipeline for -O0. We could do in future + // if we split the combines into separate O0/opt groupings. + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + if (!IsOptNone) + addPass(createAArch64PostLegalizeCombiner(IsOptNone)); +} + bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; @@ -614,6 +636,9 @@ void AArch64PassConfig::addPreSched2() { // info. 
addPass(createAArch64SpeculationHardeningPass()); + addPass(createAArch64IndirectThunks()); + addPass(createAArch64SLSHardeningPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableFalkorHWPFFix) addPass(createFalkorHWPFFixPass()); @@ -648,4 +673,28 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && TM->getTargetTriple().isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); + + // SVE bundles move prefixes with destructive operations. + addPass(createUnpackMachineBundles(nullptr)); +} + +yaml::MachineFunctionInfo * +AArch64TargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::AArch64FunctionInfo(); +} + +yaml::MachineFunctionInfo * +AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<AArch64FunctionInfo>(); + return new yaml::AArch64FunctionInfo(*MFI); +} + +bool AArch64TargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const auto &YamlMFI = + reinterpret_cast<const yaml::AArch64FunctionInfo &>(MFI); + MachineFunction &MF = PFS.MF; + MF.getInfo<AArch64FunctionInfo>()->initializeBaseYamlFields(YamlMFI); + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 5264efb89b9c5..7738a42293919 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -49,6 +49,14 @@ public: return TLOF.get(); } + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; + private: bool isLittle; }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 54562094fcf56..dfc66f0cb4c16 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -20,7 +20,6 @@ using namespace dwarf; void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); // AARCH64 ELF ABI does not define static relocation type for TLS offset // within a module. Do not generate AT_location for TLS variables. 
SupportDebugThreadLocalLocation = false; @@ -43,7 +42,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } @@ -68,7 +67,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index 1cb4c028c80d2..28324c2ae608f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -18,6 +18,11 @@ class AArch64TargetMachine; /// This implementation is used for AArch64 ELF targets (Linux in particular). class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + +public: + AArch64_ELFTargetObjectFile() { + PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; + } }; /// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4724d6b8daea7..cf6de797727be 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -57,7 +57,8 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// Calculate the cost of materializing the given constant. -int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -82,7 +83,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { } int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -139,16 +141,17 @@ int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Idx == ImmIdx) { int NumConstants = (BitSize + 63) / 64; - int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast<int>(TTI::TCC_Free) : Cost; } - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -161,7 +164,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, // selected instruction, so we compute the materialization cost for the // immediate directly. 
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); switch (IID) { default: @@ -174,7 +177,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, case Intrinsic::umul_with_overflow: if (Idx == 1) { int NumConstants = (BitSize + 63) / 64; - int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast<int>(TTI::TCC_Free) : Cost; @@ -190,7 +193,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, return TTI::TCC_Free; break; } - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } TargetTransformInfo::PopcntSupportKind @@ -208,8 +211,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // A helper that returns a vector type from the given type. The number of // elements in type Ty determine the vector width. auto toVectorTy = [&](Type *ArgTy) { - return VectorType::get(ArgTy->getScalarType(), - DstTy->getVectorNumElements()); + return FixedVectorType::get(ArgTy->getScalarType(), + cast<FixedVectorType>(DstTy)->getNumElements()); }; // Exit early if DstTy is not a vector type whose elements are at least @@ -251,7 +254,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // Legalize the source type and ensure it can be used in a widening // operation. - Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto *SrcTy = toVectorTy(Extend->getSrcTy()); auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) @@ -267,6 +270,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, } int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -291,11 +295,18 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } } + // TODO: Allow non-throughput costs that aren't binary. + auto AdjustCost = [&CostKind](int Cost) { + if (CostKind != TTI::TCK_RecipThroughput) + return Cost == 0 ? 
0 : 1; + return Cost; + }; + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); static const TypeConversionCostTblEntry ConversionTbl[] = { @@ -397,9 +408,9 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); } int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -425,17 +436,18 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); auto DstVT = TLI->getValueType(DL, Dst); auto SrcVT = TLI->getValueType(DL, Src); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If the resulting type is still a vector and the destination type is legal, // we may get the extension for free. If not, get the default cost for the // extend. if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); // The destination type should be larger than the element type. If not, get // the default cost for the extend. if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); switch (Opcode) { default: @@ -454,7 +466,16 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, } // If we are unable to perform the extend for free, get the default cost. - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); +} + +unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind) { + if (CostKind != TTI::TCK_RecipThroughput) + return Opcode == Instruction::PHI ? 0 : 1; + assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); + // Branches are assumed to be predicted. + return 0; } int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, @@ -483,10 +504,17 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } int AArch64TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { + // TODO: Handle more cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); + // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -504,7 +532,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::SDIV: if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -513,16 +542,20 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return Cost; @@ -535,31 +568,34 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Vector signed division by constant are expanded to the // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division // to MULHS + SUB + SRL + ADD + SRL. - int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info, - Opd2Info, + int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, - Opd2Info, + int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, - Opd2Info, + int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; } } - Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); if (Ty->isVectorTy()) { // On AArch64, vector divisions are not supported natively and are // expanded into scalar divisions of each pair of elements. - Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info, - Opd2Info, Opd1PropInfo, Opd2PropInfo); - Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info, - Opd2Info, Opd1PropInfo, Opd2PropInfo); + Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo); + Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo); // TODO: if one of the arguments is scalar, then it's not necessary to // double the cost of handling the vector elements. 
Cost += Cost; @@ -574,6 +610,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. return (Cost + 1) * LT.first; + + case ISD::FADD: + // These nodes are marked as 'custom' just to lower them to SVE. + // We know said lowering will incur no additional cost. + if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty()) + return (Cost + 2) * LT.first; + + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, + Opd1PropInfo, Opd2PropInfo); } } @@ -596,7 +642,12 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy, const Instruction *I) { + Type *CondTy, + TTI::TargetCostKind CostKind, + const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -623,13 +674,18 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); } AArch64TTIImpl::TTI::MemCmpExpansionOptions AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; - Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); + if (ST->requiresStrictAlign()) { + // TODO: Add cost modeling for strict align. Misaligned loads expand to + // a bunch of instructions when strict align is enabled. + return Options; + } + Options.AllowOverlappingLoads = true; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); Options.NumLoadsPerBlock = Options.MaxNumLoads; // TODO: Though vector loads usually perform well on AArch64, in some targets @@ -641,7 +697,17 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return 1; + + // Type legalization can't handle structs + if (TLI->getValueType(DL, Ty, true) == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, + CostKind); + auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && @@ -656,7 +722,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return LT.first * 2 * AmortizationCost; } - if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) { + if (Ty->isVectorTy() && + cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) { unsigned ProfitableNumElements; if (Opcode == Instruction::Store) // We use a custom trunc store lowering so v.4b should be profitable. @@ -666,8 +733,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, // have to promote the elements to v.2. 
ProfitableNumElements = 8; - if (Ty->getVectorNumElements() < ProfitableNumElements) { - unsigned NumVecElts = Ty->getVectorNumElements(); + if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) { + unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; // We generate 2 instructions per vector element. return NumVectorizableInstsToAmortize * NumVecElts * 2; @@ -677,20 +744,18 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return LT.first; } -int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int AArch64TTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); - assert(isa<VectorType>(VecTy) && "Expect a vector type"); + auto *VecVTy = cast<FixedVectorType>(VecTy); if (!UseMaskForCond && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { - unsigned NumElts = VecTy->getVectorNumElements(); - auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); + unsigned NumElts = VecVTy->getNumElements(); + auto *SubVecTy = + FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -701,18 +766,20 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { int Cost = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; for (auto *I : Tys) { if (!I->isVectorTy()) continue; - if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128) - Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) + - getMemoryOpCost(Instruction::Load, I, Align(128), 0); + if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == + 128) + Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + + getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); } return Cost; } @@ -792,6 +859,11 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, getFalkorUnrollingPreferences(L, SE, UP); } +void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} + Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType) { switch (Inst->getIntrinsicID()) { @@ -902,7 +974,7 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { - assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); + auto *VTy = cast<VectorType>(Ty); unsigned ScalarBits = Ty->getScalarSizeInBits(); switch (Opcode) { case Instruction::FAdd: @@ -913,10 +985,10 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, case Instruction::Mul: return false; case Instruction::Add: - return 
ScalarBits * Ty->getVectorNumElements() >= 128; + return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128; case Instruction::ICmp: return (ScalarBits < 64) && - (ScalarBits * Ty->getVectorNumElements() >= 128); + (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128); case Instruction::FCmp: return Flags.NoNaN; default: @@ -925,11 +997,14 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, return false; } -int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwiseForm) { +int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, + VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind) { if (IsPairwiseForm) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -950,11 +1025,12 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); } -int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + int Index, VectorType *SubTp) { if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) { static const CostTblEntry ShuffleTbl[] = { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 6f4569a497831..1f029689a60e6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -72,11 +72,11 @@ public: using BaseT::getIntImmCost; int getIntImmCost(int64_t Val); - int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); + Type *Ty, TTI::TargetCostKind CostKind); int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + Type *Ty, TTI::TargetCostKind CostKind); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -98,6 +98,8 @@ public: unsigned getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasSVE()) + return std::max(ST->getMinSVEVectorSizeInBits(), 128u); if (ST->hasNEON()) return 128; return 0; @@ -112,15 +114,19 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, @@ -131,30 +137,37 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV 
*Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, - unsigned AddressSpace, const Instruction *I = nullptr); + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); + Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - bool isLegalMaskedLoadStore(Type *DataType, MaybeAlign Alignment) { - if (!isa<VectorType>(DataType) || !ST->hasSVE()) + bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) { + if (!isa<ScalableVectorType>(DataType) || !ST->hasSVE()) return false; - Type *Ty = DataType->getVectorElementType(); - if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) + Type *Ty = cast<ScalableVectorType>(DataType)->getElementType(); + if (Ty->isBFloatTy() || Ty->isHalfTy() || + Ty->isFloatTy() || Ty->isDoubleTy()) return true; if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || @@ -164,26 +177,58 @@ public: return false; } - bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataType, Align Alignment) { return isLegalMaskedLoadStore(DataType, Alignment); } - bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedStore(Type *DataType, Align Alignment) { return isLegalMaskedLoadStore(DataType, Alignment); } - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, - ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); + bool isLegalNTStore(Type *DataType, Align Alignment) { + // NOTE: The logic below is mostly geared towards LV, which calls it with + // vectors with 2 elements. We might want to improve that, if other + // users show up. + // Nontemporal vector stores can be directly lowered to STNP, if the vector + // can be halved so that each half fits into a register. That's the case if + // the element type fits into a register and the number of elements is a + // power of 2 > 1. 
+ if (auto *DataTypeVTy = dyn_cast<VectorType>(DataType)) { + unsigned NumElements = + cast<FixedVectorType>(DataTypeVTy)->getNumElements(); + unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits(); + return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 && + EltSize <= 128 && isPowerOf2_64(EltSize); + } + return BaseT::isLegalNTStore(DataType, Alignment); + } + + int getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, + bool UseMaskForCond = false, bool UseMaskForGaps = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); bool shouldExpandReduction(const IntrinsicInst *II) const { - return false; + switch (II->getIntrinsicID()) { + case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::experimental_vector_reduce_v2_fmul: + // We don't have legalization support for ordered FP reductions. + return !II->getFastMathFlags().allowReassoc(); + + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + // Lowering asserts that there are no NaNs. + return !II->getFastMathFlags().noNaNs(); + + default: + // Don't expand anything else, let legalization deal with it. + return false; + } } unsigned getGISelRematGlobalCost() const { @@ -193,10 +238,12 @@ public: bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; - int getArithmeticReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm); + int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); - int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, + VectorType *SubTp); /// @} }; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index be4c960224727..0ac09c4f96f04 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -260,6 +260,8 @@ public: bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; bool ParseDirective(AsmToken DirectiveID) override; unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; @@ -755,12 +757,13 @@ public: return false; int64_t Val = MCE->getValue(); - int64_t SVal = typename std::make_signed<T>::type(Val); - int64_t UVal = typename std::make_unsigned<T>::type(Val); - if (Val != SVal && Val != UVal) + // Avoid left shift by 64 directly. + uint64_t Upper = UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4); + // Allow all-0 or all-1 in top bits to permit bitwise NOT. 
+ if ((Val & Upper) && (Val & Upper) != Upper) return false; - return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8); + return AArch64_AM::isLogicalImmediate(Val & ~Upper, sizeof(T) * 8); } bool isShiftedImm() const { return Kind == k_ShiftedImm; } @@ -852,8 +855,7 @@ public: if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = - std::is_same<int8_t, typename std::make_signed<T>::type>::value; + bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first) @@ -870,8 +872,7 @@ public: if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = - std::is_same<int8_t, typename std::make_signed<T>::type>::value; + bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first @@ -969,11 +970,15 @@ public: bool isMOVZMovAlias() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); + const MCExpr *E = getImm(); + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(E)) { + uint64_t Value = CE->getValue(); - return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); + return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); + } + // Only supports the case of Shift being 0 if an expression is used as an + // operand + return !Shift && E; } template<int RegWidth, int Shift> @@ -1033,8 +1038,10 @@ public: bool isNeonVectorRegLo() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && - AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( - Reg.RegNum); + (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum) || + AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains( + Reg.RegNum)); } template <unsigned Class> bool isSVEVectorReg() const { @@ -1606,7 +1613,7 @@ public: void addLogicalImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); - typename std::make_unsigned<T>::type Val = MCE->getValue(); + std::make_unsigned_t<T> Val = MCE->getValue(); uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8); Inst.addOperand(MCOperand::createImm(encoding)); } @@ -1615,7 +1622,7 @@ public: void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); - typename std::make_unsigned<T>::type Val = ~MCE->getValue(); + std::make_unsigned_t<T> Val = ~MCE->getValue(); uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8); Inst.addOperand(MCOperand::createImm(encoding)); } @@ -1771,9 +1778,13 @@ public: void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff)); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (CE) { + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff)); + } else { + addExpr(Inst, getImm()); + } } 
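The logical-immediate range check above now builds an explicit mask of the bits above the low sizeof(T) * 8 bits instead of round-tripping the value through signed and unsigned casts. Below is a minimal standalone sketch of that idiom with invented helper names (upperMask, upperBitsAcceptable); only the masking expression is taken from the patch. Two half-width shifts are used because a single shift by 64 would be undefined behaviour when T is 64 bits wide, and an all-ones upper half is accepted so that sign-extended values, and with them the bitwise-NOT immediate forms, still match.

#include <cstdint>
#include <cstdio>

// Hypothetical helpers; only the masking idiom comes from the patch.
template <typename T> uint64_t upperMask() {
  // Two half-width shifts instead of one shift by sizeof(T) * 8, which would
  // be undefined behaviour for 64-bit T. For T = int64_t this evaluates to 0,
  // i.e. there are no upper bits left to constrain.
  return UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4);
}

template <typename T> bool upperBitsAcceptable(int64_t Val) {
  uint64_t Upper = upperMask<T>();
  // All-zero upper bits: the value fits in T. All-one upper bits: a
  // sign-extended value, still accepted so inverted (bitwise NOT) forms work.
  return (Val & Upper) == 0 || (Val & Upper) == Upper;
}

int main() {
  std::printf("%d\n", upperBitsAcceptable<int32_t>(INT64_C(0x00000000F0F0F0F0))); // 1
  std::printf("%d\n", upperBitsAcceptable<int32_t>(INT64_C(-0x0F0F0F10)));        // 1 (0xFFFFFFFFF0F0F0F0)
  std::printf("%d\n", upperBitsAcceptable<int32_t>(INT64_C(0x00000001F0F0F0F0))); // 0
  return 0;
}

Anything whose upper bits mix zeros and ones is rejected before isLogicalImmediate is even consulted.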
template<int Shift> @@ -2243,10 +2254,16 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) { bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success; +} + +OperandMatchResultTy AArch64AsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { StartLoc = getLoc(); auto Res = tryParseScalarRegister(RegNo); EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return Res != MatchOperand_Success; + return Res; } // Matches a register name or register alias previously defined by '.req' @@ -2404,9 +2421,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreatePrefetch( *PRFM, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2427,9 +2444,9 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreatePSBHint( PSB->Encoding, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2450,9 +2467,9 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreateBTIHint( BTI->Encoding, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2827,6 +2844,7 @@ static const struct Extension { {"tlb-rmi", {AArch64::FeatureTLB_RMI}}, {"pan-rwv", {AArch64::FeaturePAN_RWV}}, {"ccpp", {AArch64::FeatureCCPP}}, + {"rcpc", {AArch64::FeatureRCPC}}, {"sve", {AArch64::FeatureSVE}}, {"sve2", {AArch64::FeatureSVE2}}, {"sve2-aes", {AArch64::FeatureSVE2AES}}, @@ -2851,6 +2869,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { Str += "ARMv8.4a"; else if (FBS[AArch64::HasV8_5aOps]) Str += "ARMv8.5a"; + else if (FBS[AArch64::HasV8_6aOps]) + Str += "ARMv8.6a"; else { auto ext = std::find_if(std::begin(ExtensionMap), std::end(ExtensionMap), @@ -3771,7 +3791,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, // First check for the AArch64-specific .req directive. if (Parser.getTok().is(AsmToken::Identifier) && - Parser.getTok().getIdentifier() == ".req") { + Parser.getTok().getIdentifier().lower() == ".req") { parseDirectiveReq(Name, NameLoc); // We always return 'error' for this, as we're done with this // statement and don't need to match the 'instruction." 
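In the tryParsePrefetch, tryParsePSBHint and tryParseBTIHint hunks above, Parser.Lex() is now issued only after the operand has been created from Tok.getString(). The reordering appears to be a lifetime fix: Lex() advances to the next token, so any text taken from the current token has to be captured first. The sketch below uses stand-in types (FakeToken and FakeParser are not LLVM classes) purely to show that ordering hazard.

#include <cstdio>
#include <string>

// Stand-ins for the real AsmToken/MCAsmParser, just to show the ordering.
struct FakeToken { std::string Spelling; };

struct FakeParser {
  FakeToken Current{"pldl1keep"};
  const FakeToken &getTok() const { return Current; }
  void Lex() { Current = {"<next token>"}; } // the old spelling is gone now
};

int main() {
  FakeParser Parser;
  const FakeToken &Tok = Parser.getTok();

  // Patched order: read the spelling, build the operand, then consume.
  std::string OperandName = Tok.Spelling;
  Parser.Lex();
  std::printf("operand: %s\n", OperandName.c_str()); // prints "pldl1keep"

  // With the old order, Tok.Spelling would be read after Lex() and would
  // describe the following token instead of the prefetch/hint name.
  return 0;
}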
@@ -4106,6 +4126,16 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, "unpredictable STXP instruction, status is also a source"); break; } + case AArch64::LDRABwriteback: + case AArch64::LDRAAwriteback: { + unsigned Xt = Inst.getOperand(0).getReg(); + unsigned Xn = Inst.getOperand(1).getReg(); + if (Xt == Xn) + return Error(Loc[0], + "unpredictable LDRA instruction, writeback base" + " is also a destination"); + break; + } } @@ -4235,6 +4265,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "index must be a multiple of 4 in range [-32, 28]."); case Match_InvalidMemoryIndexed16SImm4: return Error(Loc, "index must be a multiple of 16 in range [-128, 112]."); + case Match_InvalidMemoryIndexed32SImm4: + return Error(Loc, "index must be a multiple of 32 in range [-256, 224]."); case Match_InvalidMemoryIndexed1SImm6: return Error(Loc, "index must be an integer in range [-32, 31]."); case Match_InvalidMemoryIndexedSImm8: @@ -4824,7 +4856,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, getSTI()); + Out.emitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -4894,6 +4926,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed4SImm4: case Match_InvalidMemoryIndexed1SImm6: case Match_InvalidMemoryIndexed16SImm4: + case Match_InvalidMemoryIndexed32SImm4: case Match_InvalidMemoryIndexed4SImm7: case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: @@ -5024,7 +5057,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { getContext().getObjectFileInfo()->getObjectFileType(); bool IsMachO = Format == MCObjectFileInfo::IsMachO; - StringRef IDVal = DirectiveID.getIdentifier(); + auto IDVal = DirectiveID.getIdentifier().lower(); SMLoc Loc = DirectiveID.getLoc(); if (IDVal == ".arch") parseDirectiveArch(Loc); @@ -5076,6 +5109,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, break; case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: + case AArch64::ArchKind::ARMV8_6A: RequestedExtensions.push_back("sm4"); RequestedExtensions.push_back("sha3"); RequestedExtensions.push_back("sha2"); @@ -5095,6 +5129,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, break; case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: + case AArch64::ArchKind::ARMV8_6A: RequestedExtensions.push_back("nosm4"); RequestedExtensions.push_back("nosha3"); RequestedExtensions.push_back("nosha2"); @@ -5314,7 +5349,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, getSTI()); + getParser().getStreamer().emitInstruction(Inst, getSTI()); return false; } @@ -5365,7 +5400,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { "unexpected token in '" + Twine(IDVal) + "' directive")) return true; - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); + getStreamer().emitLOHDirective((MCLOHType)Kind, Args); return false; } @@ -5458,7 +5493,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { bool AArch64AsmParser::parseDirectiveCFINegateRAState() { if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) return true; - getStreamer().EmitCFINegateRAState(); + getStreamer().emitCFINegateRAState(); return false; } @@ -5468,7 +5503,7 @@ 
bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() { if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.cfi_b_key_frame'")) return true; - getStreamer().EmitCFIBKeyFrame(); + getStreamer().emitCFIBKeyFrame(); return false; } diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index d6db88603429f..1ff4abb340540 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -146,6 +146,9 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); @@ -1501,6 +1504,39 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, return Success; } +static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 | + fieldFromInstruction(insn, 12, 9); + unsigned writeback = fieldFromInstruction(insn, 11, 1); + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::LDRAAwriteback: + case AArch64::LDRABwriteback: + DecodeGPR64spRegisterClass(Inst, Rn /* writeback register */, Addr, + Decoder); + break; + case AArch64::LDRAAindexed: + case AArch64::LDRABindexed: + break; + } + + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSImm<10>(Inst, offset, Addr, Decoder); + + if (writeback && Rt == Rn && Rn != 31) { + return SoftFail; + } + + return Success; +} + static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 76ff238234d99..11a8d5def4296 100644 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -62,10 +62,9 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { auto &MFI = MIRBuilder.getMF().getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64)); - MIRBuilder.buildFrameIndex(AddrReg, FI); + auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI); StackUsed = std::max(StackUsed, Size + Offset); - return AddrReg; + return AddrReg.getReg(0); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -87,10 +86,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - // FIXME: Get alignment - auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, - 1); + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } @@ -134,7 +133,7 
@@ struct OutgoingArgHandler : public CallLowering::ValueHandler { int FPDiff = 0) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), - StackSize(0) {} + StackSize(0), SPReg(0) {} bool isIncomingArgumentHandler() const override { return false; } @@ -147,23 +146,20 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { if (IsTailCall) { Offset += FPDiff; int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); - Register FIReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildFrameIndex(FIReg, FI); + auto FIReg = MIRBuilder.buildFrameIndex(p0, FI); MPO = MachinePointerInfo::getFixedStack(MF, FI); - return FIReg; + return FIReg.getReg(0); } - Register SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, Register(AArch64::SP)); + if (!SPReg) + SPReg = MIRBuilder.buildCopy(p0, Register(AArch64::SP)).getReg(0); - Register OffsetReg = MRI.createGenericVirtualRegister(s64); - MIRBuilder.buildConstant(OffsetReg, Offset); + auto OffsetReg = MIRBuilder.buildConstant(s64, Offset); - Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MF, Offset); - return AddrReg; + return AddrReg.getReg(0); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -175,17 +171,33 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) { - Size = VA.getLocVT().getSizeInBits() / 8; - ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg) - ->getOperand(0) - .getReg(); - } - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOStore, Size, 1); + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildStore(ValVReg, Addr, *MMO); } + void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr, + uint64_t Size, MachinePointerInfo &MPO, + CCValAssign &VA) override { + unsigned MaxSize = Size * 8; + // For varargs, we always want to extend them to 8 bytes, in which case + // we disable setting a max. + if (!Arg.IsFixed) + MaxSize = 0; + + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt + ? extendRegister(Arg.Regs[0], VA, MaxSize) + : Arg.Regs[0]; + + // If we extended we might need to adjust the MMO's Size. + const LLT RegTy = MRI.getType(ValVReg); + if (RegTy.getSizeInBytes() > Size) + Size = RegTy.getSizeInBytes(); + + assignValueToAddress(ValVReg, Addr, Size, MPO, VA); + } + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, @@ -209,6 +221,9 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { /// callee's. Unused elsewhere. int FPDiff; uint64_t StackSize; + + // Cache the SP register vreg if we need it more than once in this call site. 
+ Register SPReg; }; } // namespace @@ -222,13 +237,13 @@ void AArch64CallLowering::splitToValueTypes( const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); LLVMContext &Ctx = OrigArg.Ty->getContext(); - if (OrigArg.Ty->isVoidTy()) - return; - SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + if (SplitVTs.size() == 0) + return; + if (SplitVTs.size() == 1) { // No splitting to do, but we want to replace the original type (e.g. [1 x // double] -> double). @@ -322,8 +337,7 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, } auto Undef = MIRBuilder.buildUndef({OldLLT}); CurVReg = - MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)}) - .getReg(0); + MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef}).getReg(0); } else { // Just do a vector extend. CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}) @@ -413,6 +427,14 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, } } +bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const { + if (isa<ScalableVectorType>(F.getReturnType())) + return true; + return llvm::any_of(F.args(), [](const Argument &A) { + return isa<ScalableVectorType>(A.getType()); + }); +} + bool AArch64CallLowering::lowerFormalArguments( MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const { @@ -424,7 +446,7 @@ bool AArch64CallLowering::lowerFormalArguments( SmallVector<ArgInfo, 8> SplitArgs; unsigned i = 0; for (auto &Arg : F.args()) { - if (DL.getTypeStoreSize(Arg.getType()) == 0) + if (DL.getTypeStoreSize(Arg.getType()).isZero()) continue; ArgInfo OrigArg{VRegs[i], Arg.getType()}; @@ -759,17 +781,17 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization( return true; } -static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect, +static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall) { if (!IsTailCall) - return IsIndirect ? AArch64::BLR : AArch64::BL; + return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; if (!IsIndirect) return AArch64::TCRETURNdi; // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use // x16 or x17. - if (CallerF.hasFnAttribute("branch-target-enforcement")) + if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement")) return AArch64::TCRETURNriBTI; return AArch64::TCRETURNri; @@ -805,7 +827,7 @@ bool AArch64CallLowering::lowerTailCall( if (!IsSibCall) CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); - unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.add(Info.Callee); @@ -863,7 +885,6 @@ bool AArch64CallLowering::lowerTailCall( const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); // Do the actual argument marshalling. - SmallVector<unsigned, 8> PhysRegs; OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, AssignFnVarArg, true, FPDiff); if (!handleAssignments(MIRBuilder, OutArgs, Handler)) @@ -965,7 +986,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. 
- unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.add(Info.Callee); @@ -981,7 +1002,6 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, TRI->emitReservedArgRegCallError(MF); // Do the actual argument marshalling. - SmallVector<unsigned, 8> PhysRegs; OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, AssignFnVarArg, false); if (!handleAssignments(MIRBuilder, OutArgs, Handler)) diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h index b0c601c7062c0..640a862530596 100644 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -37,6 +37,8 @@ public: ArrayRef<Register> VRegs, Register SwiftErrorVReg) const override; + bool fallBackToDAGISel(const Function &F) const override; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index b9ac2657e1c5a..408f0cb77e738 100644 --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -31,6 +31,8 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Type.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/Debug.h" @@ -63,6 +65,9 @@ public: // cache it here for each run of the selector. ProduceNonFlagSettingCondBr = !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + MFReturnAddr = Register(); + + processPHIs(MF); } private: @@ -71,23 +76,33 @@ private: bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; // A lowering phase that runs before any selection attempts. - - void preISelLower(MachineInstr &I) const; + // Returns true if the instruction was modified. + bool preISelLower(MachineInstr &I); // An early selection function that runs before the selectImpl() call. bool earlySelect(MachineInstr &I) const; + // Do some preprocessing of G_PHIs before we begin selection. + void processPHIs(MachineFunction &MF); + bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
- void contractCrossBankCopyIntoStore(MachineInstr &I, - MachineRegisterInfo &MRI) const; + bool contractCrossBankCopyIntoStore(MachineInstr &I, + MachineRegisterInfo &MRI); + + bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; + bool tryOptAndIntoCompareBranch(MachineInstr *LHS, + int64_t CmpConstant, + const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; @@ -112,6 +127,8 @@ private: const RegisterBank &RB, MachineIRBuilder &MIRBuilder) const; bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, + MachineRegisterInfo &MRI) const; bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -123,7 +140,7 @@ private: MachineRegisterInfo &MRI) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -131,17 +148,25 @@ private: bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; - unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; - MachineInstr *emitLoadFromConstantPool(Constant *CPVal, + unsigned emitConstantPoolEntry(const Constant *CPVal, + MachineFunction &MF) const; + MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, MachineIRBuilder &MIRBuilder) const; // Emit a vector concat operation. MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1, Register Op2, MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, - MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const; + + // Emit an integer compare between LHS and RHS, which checks for Predicate. + // + // This returns the produced compare instruction, and the predicate which + // was ultimately used in the compare. The predicate may differ from what + // is passed in \p Predicate due to optimization. + std::pair<MachineInstr *, CmpInst::Predicate> + emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, @@ -163,6 +188,13 @@ private: MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, MachineIRBuilder &MIRBuilder) const; + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. + /// \p IsNegative is true if the test should be "not zero". + /// This will also optimize the test bit instruction when possible. 
+ MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; + // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. // We use these manually instead of using the importer since it doesn't // support SDNodeXForm. @@ -194,6 +226,11 @@ private: return selectAddrModeUnscaled(Root, 16); } + /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used + /// from complex pattern matchers like selectAddrModeIndexed(). + ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, + MachineRegisterInfo &MRI) const; + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, unsigned Size) const; template <int Width> @@ -258,6 +295,8 @@ private: /// new copy. Register narrowExtendRegIfNeeded(Register ExtReg, MachineIRBuilder &MIB) const; + Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size, + MachineIRBuilder &MIB) const; ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, @@ -272,12 +311,17 @@ private: unsigned OpFlags) const; // Optimization methods. - bool tryOptVectorShuffle(MachineInstr &I) const; - bool tryOptVectorDup(MachineInstr &MI) const; bool tryOptSelect(MachineInstr &MI) const; MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS, + MachineOperand &RHS, + CmpInst::Predicate &Predicate, + MachineIRBuilder &MIB) const; + MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIB) const; /// Return true if \p MI is a load or store of \p NumBytes bytes. bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; @@ -295,6 +339,11 @@ private: bool ProduceNonFlagSettingCondBr = false; + // Some cached values used during selection. + // We use LR as a live-in register, and we keep track of it here as it can be + // clobbered by calls. + Register MFReturnAddr; + #define GET_GLOBALISEL_PREDICATES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -421,6 +470,39 @@ static bool getSubRegForClass(const TargetRegisterClass *RC, return true; } +/// Returns the minimum size the given register bank can hold. +static unsigned getMinSizeForRegBank(const RegisterBank &RB) { + switch (RB.getID()) { + case AArch64::GPRRegBankID: + return 32; + case AArch64::FPRRegBankID: + return 8; + default: + llvm_unreachable("Tried to get minimum size for unknown register bank."); + } +} + +static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { + auto &MI = *Root.getParent(); + auto &MBB = *MI.getParent(); + auto &MF = *MBB.getParent(); + auto &MRI = MF.getRegInfo(); + uint64_t Immed; + if (Root.isImm()) + Immed = Root.getImm(); + else if (Root.isCImm()) + Immed = Root.getCImm()->getZExtValue(); + else if (Root.isReg()) { + auto ValAndVReg = + getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); + if (!ValAndVReg) + return None; + Immed = ValAndVReg->Value; + } else + return None; + return Immed; +} + /// Check whether \p I is a currently unsupported binary operation: /// - it has an unsized type /// - an operand is not a vreg @@ -609,23 +691,20 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, } #endif -/// Helper function for selectCopy. 
Inserts a subregister copy from -/// \p *From to \p *To, linking it up to \p I. -/// -/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into +/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg +/// to \p *To. /// -/// CopyReg (From class) = COPY SrcReg -/// SubRegCopy (To class) = COPY CopyReg:SubReg -/// Dst = COPY SubRegCopy -static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, - const RegisterBankInfo &RBI, Register SrcReg, - const TargetRegisterClass *From, - const TargetRegisterClass *To, - unsigned SubReg) { +/// E.g "To = COPY SrcReg:SubReg" +static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, Register SrcReg, + const TargetRegisterClass *To, unsigned SubReg) { + assert(SrcReg.isValid() && "Expected a valid source register?"); + assert(To && "Destination register class cannot be null"); + assert(SubReg && "Expected a valid subregister"); + MachineIRBuilder MIB(I); - auto Copy = MIB.buildCopy({From}, {SrcReg}); - auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {}) - .addReg(Copy.getReg(0), 0, SubReg); + auto SubRegCopy = + MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg); MachineOperand &RegOp = I.getOperand(1); RegOp.setReg(SubRegCopy.getReg(0)); @@ -670,7 +749,6 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); @@ -703,13 +781,15 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && "No phys reg on generic operator!"); - assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); - (void)KnownValid; - return true; + bool ValidCopy = true; +#ifndef NDEBUG + ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); + assert(ValidCopy && "Invalid copy."); +#endif + return ValidCopy; }; - // Is this a copy? If so, then we may need to insert a subregister copy, or - // a SUBREG_TO_REG. + // Is this a copy? If so, then we may need to insert a subregister copy. if (I.isCopy()) { // Yes. Check if there's anything to fix up. if (!SrcRC) { @@ -719,48 +799,43 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + unsigned SubReg; - // If we're doing a cross-bank copy on different-sized registers, we need - // to do a bit more work. - if (SrcSize > DstSize) { - // We're doing a cross-bank copy into a smaller register. We need a - // subregister copy. First, get a register class that's on the same bank - // as the destination, but the same size as the source. - const TargetRegisterClass *SubregRC = - getMinClassForRegBank(DstRegBank, SrcSize, true); - assert(SubregRC && "Didn't get a register class for subreg?"); - - // Get the appropriate subregister for the destination. - unsigned SubReg = 0; - if (!getSubRegForClass(DstRC, TRI, SubReg)) { - LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); - return false; - } - - // Now, insert a subregister copy using the new register class. 
- selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); - return CheckCopy(); - } + // If the source bank doesn't support a subregister copy small enough, + // then we first need to copy to the destination bank. + if (getMinSizeForRegBank(SrcRegBank) > DstSize) { + const TargetRegisterClass *DstTempRC = + getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); + getSubRegForClass(DstRC, TRI, SubReg); - // Is this a cross-bank copy? - if (DstRegBank.getID() != SrcRegBank.getID()) { - if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && - SrcSize == 16) { - // Special case for FPR16 to GPR32. - // FIXME: This can probably be generalized like the above case. - Register PromoteReg = - MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(AArch64::SUBREG_TO_REG), PromoteReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::hsub); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(PromoteReg); + MachineIRBuilder MIB(I); + auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); + copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); + } else if (SrcSize > DstSize) { + // If the source register is bigger than the destination we need to + // perform a subregister copy. + const TargetRegisterClass *SubRegRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SubRegRC, TRI, SubReg); + copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); + } else if (DstSize > SrcSize) { + // If the destination register is bigger than the source we need to do + // a promotion using SUBREG_TO_REG. + const TargetRegisterClass *PromotionRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SrcRC, TRI, SubReg); + + Register PromoteReg = MRI.createVirtualRegister(PromotionRC); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG), PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); - // Promise that the copy is implicitly validated by the SUBREG_TO_REG. - KnownValid = true; - } + // Promise that the copy is implicitly validated by the SUBREG_TO_REG. + KnownValid = true; } // If the destination is a physical register, then there's nothing to @@ -977,6 +1052,216 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, } } +/// Return a register which can be used as a bit to test in a TB(N)Z. +static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, + MachineRegisterInfo &MRI) { + assert(Reg.isValid() && "Expected valid register!"); + while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { + unsigned Opc = MI->getOpcode(); + + if (!MI->getOperand(0).isReg() || + !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + break; + + // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. + // + // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number + // on the truncated x is the same as the bit number on x. + if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || + Opc == TargetOpcode::G_TRUNC) { + Register NextReg = MI->getOperand(1).getReg(); + // Did we find something worth folding? + if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) + break; + + // NextReg is worth folding. Keep looking. + Reg = NextReg; + continue; + } + + // Attempt to find a suitable operation with a constant on one side. 
+ Optional<uint64_t> C; + Register TestReg; + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + case TargetOpcode::G_XOR: { + TestReg = MI->getOperand(1).getReg(); + Register ConstantReg = MI->getOperand(2).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!VRegAndVal) { + // AND commutes, check the other side for a constant. + // FIXME: Can we canonicalize the constant so that it's always on the + // same side at some point earlier? + std::swap(ConstantReg, TestReg); + VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + } + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_SHL: { + TestReg = MI->getOperand(1).getReg(); + auto VRegAndVal = + getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + } + + // Didn't find a constant or viable register. Bail out of the loop. + if (!C || !TestReg.isValid()) + break; + + // We found a suitable instruction with a constant. Check to see if we can + // walk through the instruction. + Register NextReg; + unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. + if ((*C >> Bit) & 1) + NextReg = TestReg; + break; + case TargetOpcode::G_SHL: + // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in + // the type of the register. + if (*C <= Bit && (Bit - *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit - *C; + } + break; + case TargetOpcode::G_ASHR: + // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits + // in x + NextReg = TestReg; + Bit = Bit + *C; + if (Bit >= TestRegSize) + Bit = TestRegSize - 1; + break; + case TargetOpcode::G_LSHR: + // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x + if ((Bit + *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit + *C; + } + break; + case TargetOpcode::G_XOR: + // We can walk through a G_XOR by inverting whether we use tbz/tbnz when + // appropriate. + // + // e.g. If x' = xor x, c, and the b-th bit is set in c then + // + // tbz x', b -> tbnz x, b + // + // Because x' only has the b-th bit set if x does not. + if ((*C >> Bit) & 1) + Invert = !Invert; + NextReg = TestReg; + break; + } + + // Check if we found anything worth folding. + if (!NextReg.isValid()) + return Reg; + Reg = NextReg; + } + + return Reg; +} + +MachineInstr *AArch64InstructionSelector::emitTestBit( + Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const { + assert(TestReg.isValid()); + assert(ProduceNonFlagSettingCondBr && + "Cannot emit TB(N)Z with speculation tracking!"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Attempt to optimize the test bit by walking over instructions. + TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); + LLT Ty = MRI.getType(TestReg); + unsigned Size = Ty.getSizeInBits(); + assert(!Ty.isVector() && "Expected a scalar!"); + assert(Bit < 64 && "Bit is too large!"); + + // When the test register is a 64-bit register, we have to narrow to make + // TBNZW work. + bool UseWReg = Bit < 32; + unsigned NecessarySize = UseWReg ? 
32 : 64; + if (Size < NecessarySize) + TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB); + else if (Size > NecessarySize) + TestReg = narrowExtendRegIfNeeded(TestReg, MIB); + + static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, + {AArch64::TBZW, AArch64::TBNZW}}; + unsigned Opc = OpcTable[UseWReg][IsNegative]; + auto TestBitMI = + MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); + constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); + return &*TestBitMI; +} + +bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( + MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { + // Given something like this: + // + // %x = ...Something... + // %one = G_CONSTANT i64 1 + // %zero = G_CONSTANT i64 0 + // %and = G_AND %x, %one + // %cmp = G_ICMP intpred(ne), %and, %zero + // %cmp_trunc = G_TRUNC %cmp + // G_BRCOND %cmp_trunc, %bb.3 + // + // We want to try and fold the AND into the G_BRCOND and produce either a + // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). + // + // In this case, we'd get + // + // TBNZ %x %bb.3 + // + if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND) + return false; + + // Need to be comparing against 0 to fold. + if (CmpConstant != 0) + return false; + + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Only support EQ and NE. If we have LT, then it *is* possible to fold, but + // we don't want to do this. When we have an AND and LT, we need a TST/ANDS, + // so folding would be redundant. + if (Pred != CmpInst::Predicate::ICMP_EQ && + Pred != CmpInst::Predicate::ICMP_NE) + return false; + + // Check if the AND has a constant on its RHS which we can use as a mask. + // If it's a power of 2, then it's the same as checking a specific bit. + // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) + auto MaybeBit = + getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI); + if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value)) + return false; + + uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value)); + Register TestReg = AndInst->getOperand(1).getReg(); + bool Invert = Pred == CmpInst::Predicate::ICMP_NE; + + // Emit a TB(N)Z. + emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); + return true; +} + bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { @@ -991,28 +1276,67 @@ bool AArch64InstructionSelector::selectCompareBranch( Register LHS = CCMI->getOperand(2).getReg(); Register RHS = CCMI->getOperand(3).getReg(); auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); - if (!VRegAndVal) + MachineIRBuilder MIB(I); + CmpInst::Predicate Pred = + (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); + MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI); + + // When we can emit a TB(N)Z, prefer that. + // + // Handle non-commutative condition codes first. + // Note that we don't want to do this when we have a G_AND because it can + // become a tst. The tst will make the test bit in the TB(N)Z redundant. + if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) { + int64_t C = VRegAndVal->Value; + + // When we have a greater-than comparison, we can just test if the msb is + // zero. 
+ if (C == -1 && Pred == CmpInst::ICMP_SGT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + + // When we have a less than comparison, we can just test if the msb is not + // zero. + if (C == 0 && Pred == CmpInst::ICMP_SLT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + } + + if (!VRegAndVal) { std::swap(RHS, LHS); + VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + LHSMI = getDefIgnoringCopies(LHS, MRI); + } - VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); if (!VRegAndVal || VRegAndVal->Value != 0) { - MachineIRBuilder MIB(I); // If we can't select a CBZ then emit a cmp + Bcc. - if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3), - CCMI->getOperand(1), MIB)) + MachineInstr *Cmp; + std::tie(Cmp, Pred) = emitIntegerCompare( + CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB); + if (!Cmp) return false; - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - (CmpInst::Predicate)CCMI->getOperand(1).getPredicate()); + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); I.eraseFromParent(); return true; } + // Try to emit a TB(N)Z for an eq or ne condition. + if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB, + MIB)) { + I.eraseFromParent(); + return true; + } + const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) return false; - - const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) return false; @@ -1247,7 +1571,7 @@ void AArch64InstructionSelector::materializeLargeCMVal( return; } -void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { +bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1267,10 +1591,10 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { const LLT ShiftTy = MRI.getType(ShiftReg); const LLT SrcTy = MRI.getType(SrcReg); if (SrcTy.isVector()) - return; + return false; assert(!ShiftTy.isVector() && "unexpected vector shift ty"); if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) - return; + return false; auto *AmtMI = MRI.getVRegDef(ShiftReg); assert(AmtMI && "could not find a vreg definition for shift amount"); if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { @@ -1281,14 +1605,65 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); I.getOperand(2).setReg(Trunc.getReg(0)); } - return; + return true; } case TargetOpcode::G_STORE: - contractCrossBankCopyIntoStore(I, MRI); - return; + return contractCrossBankCopyIntoStore(I, MRI); + case TargetOpcode::G_PTR_ADD: + return convertPtrAddToAdd(I, MRI); + case TargetOpcode::G_LOAD: { + // For scalar loads of pointers, we try to convert the dest type from p0 + // to s64 so that our imported patterns can match. Like with the G_PTR_ADD + // conversion, this should be ok because all users should have been + // selected already, so the type doesn't matter for them. 
+ Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + if (!DstTy.isPointer()) + return false; + MRI.setType(DstReg, LLT::scalar(64)); + return true; + } default: - return; + return false; + } +} + +/// This lowering tries to look for G_PTR_ADD instructions and then converts +/// them to a standard G_ADD with a COPY on the source. +/// +/// The motivation behind this is to expose the add semantics to the imported +/// tablegen patterns. We shouldn't need to check for uses being loads/stores, +/// because the selector works bottom up, uses before defs. By the time we +/// end up trying to select a G_PTR_ADD, we should have already attempted to +/// fold this into addressing modes and were therefore unsuccessful. +bool AArch64InstructionSelector::convertPtrAddToAdd( + MachineInstr &I, MachineRegisterInfo &MRI) { + assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); + Register DstReg = I.getOperand(0).getReg(); + Register AddOp1Reg = I.getOperand(1).getReg(); + const LLT PtrTy = MRI.getType(DstReg); + if (PtrTy.getAddressSpace() != 0) + return false; + + MachineIRBuilder MIB(I); + const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64); + auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); + // Set regbanks on the registers. + if (PtrTy.isVector()) + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); + else + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + + // Now turn the %dst(p0) = G_PTR_ADD %base, off into: + // %dst(intty) = G_ADD %intbase, off + I.setDesc(TII.get(TargetOpcode::G_ADD)); + MRI.setType(DstReg, CastPtrTy); + I.getOperand(1).setReg(PtrToInt.getReg(0)); + if (!select(*PtrToInt)) { + LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); + return false; } + return true; } bool AArch64InstructionSelector::earlySelectSHL( @@ -1326,8 +1701,8 @@ bool AArch64InstructionSelector::earlySelectSHL( return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); } -void AArch64InstructionSelector::contractCrossBankCopyIntoStore( - MachineInstr &I, MachineRegisterInfo &MRI) const { +bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( + MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); // If we're storing a scalar, it doesn't matter what register bank that // scalar is on. All that matters is the size. @@ -1343,10 +1718,9 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore( // G_STORE %x:gpr(s32) // // And then continue the selection process normally. - MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI); - if (!Def) - return; - Register DefDstReg = Def->getOperand(0).getReg(); + Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); + if (!DefDstReg.isValid()) + return false; LLT DefDstTy = MRI.getType(DefDstReg); Register StoreSrcReg = I.getOperand(0).getReg(); LLT StoreSrcTy = MRI.getType(StoreSrcReg); @@ -1354,18 +1728,19 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore( // If we get something strange like a physical register, then we shouldn't // go any further. if (!DefDstTy.isValid()) - return; + return false; // Are the source and dst types the same size? 
if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) - return; + return false; if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == RBI.getRegBank(DefDstReg, MRI, TRI)) - return; + return false; // We have a cross-bank copy, which is entering a store. Let's fold it. I.getOperand(0).setReg(DefDstReg); + return true; } bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { @@ -1391,16 +1766,15 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); - if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32)) - return false; - - if (Ty == LLT::scalar(64)) { + if (Ty.getSizeInBits() == 64) { I.getOperand(1).ChangeToRegister(AArch64::XZR, false); RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); - } else { + } else if (Ty.getSizeInBits() == 32) { I.getOperand(1).ChangeToRegister(AArch64::WZR, false); RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); - } + } else + return false; + I.setDesc(TII.get(TargetOpcode::COPY)); return true; } @@ -1417,9 +1791,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64Subtarget *Subtarget = + &static_cast<const AArch64Subtarget &>(MF.getSubtarget()); + if (Subtarget->requiresStrictAlign()) { + // We don't support this feature yet. + LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); + return false; + } + unsigned Opcode = I.getOpcode(); // G_PHI requires same handling as PHI - if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) { + if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { // Certain non-generic instructions also need some special handling. if (Opcode == TargetOpcode::LOAD_STACK_GUARD) @@ -1468,7 +1850,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // Try to do some lowering before we start instruction selecting. These // lowerings are purely transformations on the input G_MIR and so selection // must continue after any modification of the instruction. - preISelLower(I); + if (preISelLower(I)) { + Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. + } // There may be patterns where the importer can't deal with them optimally, // but does select it to a suboptimal sequence so our custom C++ selection @@ -1503,8 +1887,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z // instructions will not be produced, as they are conditional branch // instructions that do not set flags. - bool ProduceNonFlagSettingCondBr = - !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI)) return true; @@ -1540,6 +1922,31 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_BRJT: return selectBrJT(I, MRI); + case AArch64::G_ADD_LOW: { + // This op may have been separated from it's ADRP companion by the localizer + // or some other code motion pass. Given that many CPUs will try to + // macro fuse these operations anyway, select this into a MOVaddr pseudo + // which will later be expanded into an ADRP+ADD pair after scheduling. 
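For context, a small self-contained sketch (illustrative only, not the selector's code) of the address split that the ADRP+ADD pair materializes and that the MOVaddr pseudo stands for: ADRP yields the symbol's 4 KiB page and the :lo12: ADD supplies the remaining low 12 bits. Here Addr is a made-up stand-in for a link-time address; the real instructions work PC-relatively through relocations.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Addr = 0x0000000012345678ULL;
  uint64_t Page = Addr & ~uint64_t(0xFFF); // what ADRP produces (page base)
  uint64_t Lo12 = Addr & 0xFFF;            // what the :lo12: ADD contributes
  assert(Page + Lo12 == Addr);
  return 0;
}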
+ MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); + if (BaseMI->getOpcode() != AArch64::ADRP) { + I.setDesc(TII.get(AArch64::ADDXri)); + I.addOperand(MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + assert(TM.getCodeModel() == CodeModel::Small && + "Expected small code model"); + MachineIRBuilder MIB(I); + auto Op1 = BaseMI->getOperand(1); + auto Op2 = I.getOperand(2); + auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) + .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), + Op1.getTargetFlags()) + .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), + Op2.getTargetFlags()); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); + } + case TargetOpcode::G_BSWAP: { // Handle vector types for G_BSWAP directly. Register DstReg = I.getOperand(0).getReg(); @@ -1644,6 +2051,20 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (emitFMovForFConstant(I, MRI)) return true; + // For 64b values, emit a constant pool load instead. + if (DefSize == 64) { + auto *FPImm = I.getOperand(1).getFPImm(); + MachineIRBuilder MIB(I); + auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); + if (!LoadMI) { + LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); + return false; + } + MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); + } + // Nope. Emit a copy and use a normal mov instead. const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC); MachineOperand &RegOp = I.getOperand(0); @@ -2005,9 +2426,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // Add and set the set condition flag. unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr; MachineIRBuilder MIRBuilder(I); - auto AddsMI = MIRBuilder.buildInstr( - AddsOpc, {I.getOperand(0).getReg()}, - {I.getOperand(2).getReg(), I.getOperand(3).getReg()}); + auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)}, + {I.getOperand(2), I.getOperand(3)}); constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); // Now, put the overflow result in the register given by the first operand @@ -2023,14 +2443,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return true; } - case TargetOpcode::G_PTR_MASK: { - uint64_t Align = I.getOperand(2).getImm(); - if (Align >= 64 || Align == 0) + case TargetOpcode::G_PTRMASK: { + Register MaskReg = I.getOperand(2).getReg(); + Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI); + // TODO: Implement arbitrary cases + if (!MaskVal || !isShiftedMask_64(*MaskVal)) return false; - uint64_t Mask = ~((1ULL << Align) - 1); + uint64_t Mask = *MaskVal; I.setDesc(TII.get(AArch64::ANDXri)); - I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64)); + I.getOperand(2).ChangeToImmediate( + AArch64_AM::encodeLogicalImmediate(Mask, 64)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -2101,6 +2524,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { I.eraseFromParent(); return true; } + + // We might have a vector G_PTRTOINT, in which case just emit a COPY. 
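The G_PTRMASK selection a few hunks above only fires when the mask is a single contiguous run of ones (the isShiftedMask_64 check), since only such masks map onto the ANDXri logical-immediate form used there. A hedged standalone restatement of that predicate (isContiguousRunOfOnes is an invented name, not the LLVM implementation):

#include <cassert>
#include <cstdint>

static bool isContiguousRunOfOnes(uint64_t V) {
  if (V == 0)
    return false;
  uint64_t Shifted = V >> __builtin_ctzll(V); // drop trailing zeros
  return (Shifted & (Shifted + 1)) == 0;      // remaining bits are 0...01...1
}

int main() {
  assert(isContiguousRunOfOnes(~uint64_t(0xF))); // typical pointer-align mask
  assert(isContiguousRunOfOnes(0x0FF0));
  assert(!isContiguousRunOfOnes(0x0F0F));        // two runs: not handled here
  return 0;
}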
+ if (Opcode == TargetOpcode::G_PTRTOINT) { + assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } } return false; @@ -2151,16 +2581,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT_INREG: case TargetOpcode::G_SEXT: { unsigned Opcode = I.getOpcode(); - const bool IsSigned = Opcode == TargetOpcode::G_SEXT; + const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; const Register DefReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DefReg); const LLT SrcTy = MRI.getType(SrcReg); unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); + // SEXT_INREG has the same src reg size as dst, the size of the value to be + // extended is encoded in the imm. + if (Opcode == TargetOpcode::G_SEXT_INREG) + SrcSize = I.getOperand(2).getImm(); + if (DstTy.isVector()) return false; // Should be handled by imported patterns. @@ -2179,31 +2615,65 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // %v2(s32) = G_ZEXT %v(s8) if (!IsSigned) { auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); - if (LoadMI && - RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) { + bool IsGPR = + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; + if (LoadMI && IsGPR) { const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); unsigned BytesLoaded = MemOp->getSize(); if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) return selectCopy(I, TII, MRI, TRI, RBI); } - } - if (DstSize == 64) { - // FIXME: Can we avoid manually doing this? - if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) - << " operand\n"); - return false; - } - - auto SubregToReg = - MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) + // If we are zero extending from 32 bits to 64 bits, it's possible that + // the instruction implicitly does the zero extend for us. In that case, + // we can just emit a SUBREG_TO_REG. + if (IsGPR && SrcSize == 32 && DstSize == 64) { + // Unlike with the G_LOAD case, we don't want to look through copies + // here. + MachineInstr *Def = MRI.getVRegDef(SrcReg); + if (Def && isDef32(*Def)) { + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) .addImm(0) .addUse(SrcReg) .addImm(AArch64::sub_32); + if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); + return false; + } + + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); + return false; + } + + I.eraseFromParent(); + return true; + } + } + } + + if (DstSize == 64) { + if (Opcode != TargetOpcode::G_SEXT_INREG) { + // FIXME: Can we avoid manually doing this? + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) + << " operand\n"); + return false; + } + SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, + {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32) + .getReg(0); + } + ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMXri : AArch64::UBFMXri, - {DefReg}, {SubregToReg}) + {DefReg}, {SrcReg}) .addImm(0) .addImm(SrcSize - 1); } else if (DstSize <= 32) { @@ -2236,6 +2706,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return true; } + case TargetOpcode::G_FREEZE: + return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_INTTOPTR: // The importer is currently unable to import pointer types since they @@ -2294,11 +2766,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } MachineIRBuilder MIRBuilder(I); - if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), - MIRBuilder)) + MachineInstr *Cmp; + CmpInst::Predicate Pred; + std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3), + I.getOperand(1), MIRBuilder); + if (!Cmp) return false; - emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(), - MIRBuilder); + emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder); I.eraseFromParent(); return true; } @@ -2435,14 +2909,13 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg}, - {JTAddr, Index}) - .addJumpTableIndex(JTI); - + auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, + {TargetReg, ScratchReg}, {JTAddr, Index}) + .addJumpTableIndex(JTI); // Build the indirect branch. MIB.buildInstr(AArch64::BR, {}, {TargetReg}); I.eraseFromParent(); - return true; + return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); } bool AArch64InstructionSelector::selectJumpTable( @@ -2482,7 +2955,7 @@ bool AArch64InstructionSelector::selectTLSGlobalValue( // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - MIB.buildInstr(AArch64::BLR, {}, {Load}) + MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) .addDef(AArch64::X0, RegState::Implicit) .addRegMask(TRI.getTLSCallPreservedMask()); @@ -3158,19 +3631,17 @@ bool AArch64InstructionSelector::selectConcatVectors( } unsigned -AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, +AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, MachineFunction &MF) const { Type *CPTy = CPVal->getType(); - unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy); - if (Align == 0) - Align = MF.getDataLayout().getTypeAllocSize(CPTy); + Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); MachineConstantPool *MCP = MF.getConstantPool(); - return MCP->getConstantPoolIndex(CPVal, Align); + return MCP->getConstantPoolIndex(CPVal, Alignment); } MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( - Constant *CPVal, MachineIRBuilder &MIRBuilder) const { + const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF()); auto Adrp = @@ -3248,7 +3719,7 @@ AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; auto ImmFns = selectArithImmed(RHS); unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; - auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()}); + auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS}); // If we matched a valid constant immediate, add those operands. 
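As background for the selectArithImmed matching used in these emit helpers, a simplified restatement (not the backend's encoder; fitsArithImmed is a made-up name) of the AArch64 add/sub immediate form: an unsigned 12-bit value, optionally shifted left by 12.

#include <cassert>
#include <cstdint>

static bool fitsArithImmed(uint64_t C) {
  return C <= 0xFFFULL || (C & ~0xFFF000ULL) == 0;
}

int main() {
  assert(fitsArithImmed(0));         // #0
  assert(fitsArithImmed(4095));      // #4095
  assert(fitsArithImmed(0x123000));  // #0x123, LSL #12
  assert(!fitsArithImmed(0x1001));   // needs a register or extra instruction
  return 0;
}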
if (ImmFns) { @@ -3274,7 +3745,7 @@ AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; - auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); // If we matched a valid constant immediate, add those operands. if (ImmFns) { @@ -3316,17 +3787,21 @@ AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS, return &*TstMI; } -MachineInstr *AArch64InstructionSelector::emitIntegerCompare( +std::pair<MachineInstr *, CmpInst::Predicate> +AArch64InstructionSelector::emitIntegerCompare( MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + assert(Predicate.isPredicate() && "Expected predicate?"); MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); + // Fold the compare if possible. MachineInstr *FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder); if (FoldCmp) - return FoldCmp; + return {FoldCmp, P}; // Can't fold into a CMN. Just emit a normal compare. unsigned CmpOpc = 0; @@ -3337,31 +3812,31 @@ MachineInstr *AArch64InstructionSelector::emitIntegerCompare( "Expected scalar or pointer"); if (CmpTy == LLT::scalar(32)) { CmpOpc = AArch64::SUBSWrr; - ZReg = AArch64::WZR; + ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { CmpOpc = AArch64::SUBSXrr; - ZReg = AArch64::XZR; + ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); } else { - return nullptr; + return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE}; } // Try to match immediate forms. - auto ImmFns = selectArithImmed(RHS); - if (ImmFns) - CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri; - - auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg()); - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(CmpMI); - } else { - CmpMI.addUse(RHS.getReg()); - } - + MachineInstr *ImmedCmp = + tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder); + if (ImmedCmp) + return {ImmedCmp, P}; + + // If we don't have an immediate, we may have a shift which can be folded + // into the compare. + MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder); + if (ShiftedCmp) + return {ShiftedCmp, P}; + + auto CmpMI = + MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()}); // Make sure that we can constrain the compare that we emitted. constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - return &*CmpMI; + return {&*CmpMI, P}; } MachineInstr *AArch64InstructionSelector::emitVectorConcat( @@ -3497,8 +3972,16 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); while (CondDef) { // We can only fold if all of the defs have one use. - if (!MRI.hasOneUse(CondDef->getOperand(0).getReg())) - return false; + Register CondDefReg = CondDef->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDefReg)) { + // Unless it's another select. 
+ for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { + if (CondDef == &UI) + continue; + if (UI.getOpcode() != TargetOpcode::G_SELECT) + return false; + } + } // We can skip over G_TRUNC since the condition is 1-bit. // Truncating/extending can have no impact on the value. @@ -3524,13 +4007,21 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { - CondCode = changeICMPPredToAArch64CC( - (CmpInst::Predicate)CondDef->getOperand(1).getPredicate()); - if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB)) { + MachineInstr *Cmp; + CmpInst::Predicate Pred; + + std::tie(Cmp, Pred) = + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), + CondDef->getOperand(1), MIB); + + if (!Cmp) { LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); return false; } + + // Have to collect the CondCode after emitIntegerCompare, since it can + // update the predicate. + CondCode = changeICMPPredToAArch64CC(Pred); } else { // Get the condition code for the select. AArch64CC::CondCode CondCode2; @@ -3660,119 +4151,150 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( return nullptr; } -bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { - // Try to match a vector splat operation into a dup instruction. - // We're looking for this pattern: - // %scalar:gpr(s64) = COPY $x0 - // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF - // %cst0:gpr(s32) = G_CONSTANT i32 0 - // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) - // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) - // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, - // %zerovec(<2 x s32>) - // - // ...into: - // %splat = DUP %scalar - // We use the regbank of the scalar to determine which kind of dup to use. - MachineIRBuilder MIB(I); +MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P, + MachineIRBuilder &MIB) const { + // Attempt to select the immediate form of an integer compare. MachineRegisterInfo &MRI = *MIB.getMRI(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - using namespace TargetOpcode; - using namespace MIPatternMatch; - - // Begin matching the insert. - auto *InsMI = - getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI); - if (!InsMI) - return false; - // Match the undef vector operand. - auto *UndefMI = - getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI); - if (!UndefMI) - return false; - // Match the scalar being splatted. - Register ScalarReg = InsMI->getOperand(2).getReg(); - const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI); - // Match the index constant 0. - int64_t Index = 0; - if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) - return false; - - // The shuffle's second operand doesn't matter if the mask is all zero. - ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); - if (!all_of(Mask, [](int Elem) { return Elem == 0; })) - return false; + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "Expected 32 bit or 64 bit compare only?"); + + // Check if this is a case we can already handle. 
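The constant-adjustment cases that follow in tryOptArithImmedIntegerCompare rest on standard ordering identities, valid as long as the adjusted constant does not wrap past INT_MIN/INT_MAX (or 0/UINT_MAX for unsigned). A throwaway check of those identities, independent of the selector:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t SVals[] = {-3, 0, 41, 42, 43};
  for (int64_t X : SVals) {
    const int64_t C = 42;
    assert((X < C) == (X <= C - 1));  // slt c  -> sle c-1
    assert((X >= C) == (X > C - 1));  // sge c  -> sgt c-1
    assert((X <= C) == (X < C + 1));  // sle c  -> slt c+1
    assert((X > C) == (X >= C + 1));  // sgt c  -> sge c+1
  }
  const uint64_t UVals[] = {0, 9, 10, 11};
  for (uint64_t X : UVals) {
    const uint64_t C = 10;
    assert((X < C) == (X <= C - 1));  // ult c  -> ule c-1 (c != 0)
    assert((X >= C) == (X > C - 1));  // uge c  -> ugt c-1
    assert((X <= C) == (X < C + 1));  // ule c  -> ult c+1 (c != UINT64_MAX)
    assert((X > C) == (X >= C + 1));  // ugt c  -> uge c+1
  }
  return 0;
}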
+ InstructionSelector::ComplexRendererFns ImmFns; + ImmFns = selectArithImmed(RHS); + + if (!ImmFns) { + // We didn't get a rendering function, but we may still have a constant. + auto MaybeImmed = getImmedFromMO(RHS); + if (!MaybeImmed) + return nullptr; - // We're done, now find out what kind of splat we need. - LLT VecTy = MRI.getType(I.getOperand(0).getReg()); - LLT EltTy = VecTy.getElementType(); - if (EltTy.getSizeInBits() < 32) { - LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet"); - return false; - } - bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID; - unsigned Opc = 0; - if (IsFP) { - switch (EltTy.getSizeInBits()) { - case 32: - if (VecTy.getNumElements() == 2) { - Opc = AArch64::DUPv2i32lane; - } else { - Opc = AArch64::DUPv4i32lane; - assert(VecTy.getNumElements() == 4); - } + // We have a constant, but it doesn't fit. Try adjusting it by one and + // updating the predicate if possible. + uint64_t C = *MaybeImmed; + CmpInst::Predicate NewP; + switch (P) { + default: + return nullptr; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + // Check for + // + // x slt c => x sle c - 1 + // x sge c => x sgt c - 1 + // + // When c is not the smallest possible negative number. + if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) || + (Size == 32 && static_cast<int32_t>(C) == INT32_MIN)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT; + C -= 1; break; - case 64: - assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); - Opc = AArch64::DUPv2i64lane; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_UGE: + // Check for + // + // x ult c => x ule c - 1 + // x uge c => x ugt c - 1 + // + // When c is not zero. + if (C == 0) + return nullptr; + NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; + C -= 1; break; - } - } else { - switch (EltTy.getSizeInBits()) { - case 32: - if (VecTy.getNumElements() == 2) { - Opc = AArch64::DUPv2i32gpr; - } else { - Opc = AArch64::DUPv4i32gpr; - assert(VecTy.getNumElements() == 4); - } + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_SGT: + // Check for + // + // x sle c => x slt c + 1 + // x sgt c => s sge c + 1 + // + // When c is not the largest possible signed integer. + if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) || + (Size == 64 && static_cast<int64_t>(C) == INT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE; + C += 1; break; - case 64: - assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); - Opc = AArch64::DUPv2i64gpr; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_UGT: + // Check for + // + // x ule c => x ult c + 1 + // x ugt c => s uge c + 1 + // + // When c is not the largest possible unsigned integer. + if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) || + (Size == 64 && C == UINT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE; + C += 1; break; } + + // Check if the new constant is valid. + if (Size == 32) + C = static_cast<uint32_t>(C); + ImmFns = select12BitValueWithLeftShift(C); + if (!ImmFns) + return nullptr; + P = NewP; } - assert(Opc && "Did not compute an opcode for a dup"); - // For FP splats, we need to widen the scalar reg via undef too. - if (IsFP) { - MachineInstr *Widen = emitScalarToVector( - EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB); - if (!Widen) - return false; - ScalarReg = Widen->getOperand(0).getReg(); + // At this point, we know we can select an immediate form. 
Go ahead and do + // that. + Register ZReg; + unsigned Opc; + if (Size == 32) { + ZReg = AArch64::WZR; + Opc = AArch64::SUBSWri; + } else { + ZReg = AArch64::XZR; + Opc = AArch64::SUBSXri; } - auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg}); - if (IsFP) - Dup.addImm(0); - constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI); - I.eraseFromParent(); - return true; + + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; } -bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const { - if (TM.getOptLevel() == CodeGenOpt::None) - return false; - if (tryOptVectorDup(I)) - return true; - return false; +MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const { + // We are looking for the following pattern: + // + // shift = G_SHL/ASHR/LHSR y, c + // ... + // cmp = G_ICMP pred, something, shift + // + // Since we will select the G_ICMP to a SUBS, we can potentially fold the + // shift into the subtract. + static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs}; + static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR}; + auto ImmFns = selectShiftedRegister(RHS); + if (!ImmFns) + return nullptr; + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + bool Idx = (Size == 64); + Register ZReg = ZRegTable[Idx]; + unsigned Opc = OpcTable[Idx]; + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; } bool AArch64InstructionSelector::selectShuffleVector( MachineInstr &I, MachineRegisterInfo &MRI) const { - if (tryOptVectorShuffle(I)) - return true; const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); Register Src1Reg = I.getOperand(1).getReg(); const LLT Src1Ty = MRI.getType(Src1Reg); @@ -3852,9 +4374,8 @@ bool AArch64InstructionSelector::selectShuffleVector( .addUse(Src2Reg) .addImm(AArch64::qsub1); - auto TBL2 = - MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()}, - {RegSeq, IndexLoad->getOperand(0).getReg()}); + auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, + {RegSeq, IndexLoad->getOperand(0)}); constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); I.eraseFromParent(); @@ -3968,6 +4489,44 @@ bool AArch64InstructionSelector::selectInsertElt( return true; } +bool AArch64InstructionSelector::tryOptConstantBuildVec( + MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!"); + if (DstTy.getSizeInBits() < 32) + return false; + // Check if we're building a constant vector, in which case we want to + // generate a constant pool load instead of a vector insert sequence. 
+ SmallVector<Constant *, 16> Csts; + for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { + // Try to find G_CONSTANT or G_FCONSTANT + auto *OpMI = + getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); + if (OpMI) + Csts.emplace_back( + const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); + else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, + I.getOperand(Idx).getReg(), MRI))) + Csts.emplace_back( + const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); + else + return false; + } + Constant *CV = ConstantVector::get(Csts); + MachineIRBuilder MIB(I); + auto *CPLoad = emitLoadFromConstantPool(CV, MIB); + if (!CPLoad) { + LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); + return false; + } + MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + *MRI.getRegClass(CPLoad->getOperand(0).getReg()), + MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectBuildVector( MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); @@ -3976,6 +4535,9 @@ bool AArch64InstructionSelector::selectBuildVector( const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); unsigned EltSize = EltTy.getSizeInBits(); + + if (tryOptConstantBuildVec(I, DstTy, MRI)) + return true; if (EltSize < 16 || EltSize > 64) return false; // Don't support all element types yet. const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); @@ -4081,8 +4643,8 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( return true; } -bool AArch64InstructionSelector::selectIntrinsic( - MachineInstr &I, MachineRegisterInfo &MRI) const { +bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, + MachineRegisterInfo &MRI) { unsigned IntrinID = findIntrinsicID(I); if (!IntrinID) return false; @@ -4091,7 +4653,7 @@ bool AArch64InstructionSelector::selectIntrinsic( switch (IntrinID) { default: break; - case Intrinsic::aarch64_crypto_sha1h: + case Intrinsic::aarch64_crypto_sha1h: { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(2).getReg(); @@ -4130,28 +4692,59 @@ bool AArch64InstructionSelector::selectIntrinsic( I.eraseFromParent(); return true; } - return false; -} + case Intrinsic::frameaddress: + case Intrinsic::returnaddress: { + MachineFunction &MF = *I.getParent()->getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); -static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { - auto &MI = *Root.getParent(); - auto &MBB = *MI.getParent(); - auto &MF = *MBB.getParent(); - auto &MRI = MF.getRegInfo(); - uint64_t Immed; - if (Root.isImm()) - Immed = Root.getImm(); - else if (Root.isCImm()) - Immed = Root.getCImm()->getZExtValue(); - else if (Root.isReg()) { - auto ValAndVReg = - getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); - if (!ValAndVReg) - return None; - Immed = ValAndVReg->Value; - } else - return None; - return Immed; + unsigned Depth = I.getOperand(2).getImm(); + Register DstReg = I.getOperand(0).getReg(); + RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); + + if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { + if (MFReturnAddr) { + MIRBuilder.buildCopy({DstReg}, MFReturnAddr); + I.eraseFromParent(); + return true; + } + MFI.setReturnAddressIsTaken(true); + MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass); + // Insert the copy from LR/X30 into the entry 
block, before it can be + // clobbered by anything. + MachineBasicBlock &EntryBlock = *MF.begin(); + if (!EntryBlock.isLiveIn(AArch64::LR)) + EntryBlock.addLiveIn(AArch64::LR); + MachineIRBuilder EntryBuilder(MF); + EntryBuilder.setInstr(*EntryBlock.begin()); + EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + MFReturnAddr = DstReg; + I.eraseFromParent(); + return true; + } + + MFI.setFrameAddressIsTaken(true); + Register FrameAddr(AArch64::FP); + while (Depth--) { + Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + auto Ldr = + MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}) + .addImm(0); + constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); + FrameAddr = NextFrame; + } + + if (IntrinID == Intrinsic::frameaddress) + MIRBuilder.buildCopy({DstReg}, {FrameAddr}); + else { + MFI.setReturnAddressIsTaken(true); + MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1); + } + + I.eraseFromParent(); + return true; + } + } + return false; } InstructionSelector::ComplexRendererFns @@ -4271,7 +4864,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( MachineInstr &MI, const MachineRegisterInfo &MRI) const { // Always fold if there is one use, or if we're optimizing for size. Register DefReg = MI.getOperand(0).getReg(); - if (MRI.hasOneUse(DefReg) || + if (MRI.hasOneNonDBGUse(DefReg) || MI.getParent()->getParent()->getFunction().hasMinSize()) return true; @@ -4283,10 +4876,21 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( // We have a fastpath, so folding a shift in and potentially computing it // many times may be beneficial. Check if this is only used in memory ops. // If it is, then we should fold. - return all_of(MRI.use_instructions(DefReg), + return all_of(MRI.use_nodbg_instructions(DefReg), [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); } +static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { + switch (Type) { + case AArch64_AM::SXTB: + case AArch64_AM::SXTH: + case AArch64_AM::SXTW: + return true; + default: + return false; + } +} + InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectExtendedSHL( MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, @@ -4359,7 +4963,10 @@ AArch64InstructionSelector::selectExtendedSHL( if (Ext == AArch64_AM::InvalidShiftExtend) return None; - SignExtend = Ext == AArch64_AM::SXTW; + SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; + // We only support SXTW for signed extension here. + if (SignExtend && Ext != AArch64_AM::SXTW) + return None; // Need a 32-bit wide register here. MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); @@ -4441,7 +5048,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset( // If this is used more than once, let's not bother folding. // TODO: Check if they are memory ops. If they are, then we can still fold // without having to recompute anything. - if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) + if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) return None; // Base is the GEP's LHS, offset is its RHS. 
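The register-offset and extended-shift addressing modes matched in the preceding hunks compute, in effect, base + (extend(offset) << log2(size)). A self-contained illustration of that arithmetic (effectiveAddr and the register names in the comment are purely for exposition):

#include <cassert>
#include <cstdint>

// Models the effective address of e.g. "ldr x0, [x1, w2, sxtw #3]": a 32-bit
// offset is sign-extended, scaled by the access size, and added to the base.
static uint64_t effectiveAddr(uint64_t Base, int32_t WOffset,
                              unsigned Log2Scale) {
  int64_t Extended = static_cast<int64_t>(WOffset); // SXTW
  return Base + (static_cast<uint64_t>(Extended) << Log2Scale);
}

int main() {
  // Base 0x1000, offset -1 element of an 8-byte type => 0x1000 - 8.
  assert(effectiveAddr(0x1000, -1, 3) == 0x1000 - 8);
  // Offset +2 elements of a 4-byte type => 0x1000 + 8.
  assert(effectiveAddr(0x1000, 2, 2) == 0x1000 + 8);
  return 0;
}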
@@ -4595,14 +5202,46 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, return None; } +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, + unsigned Size, + MachineRegisterInfo &MRI) const { + if (RootDef.getOpcode() != AArch64::G_ADD_LOW) + return None; + MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); + if (Adrp.getOpcode() != AArch64::ADRP) + return None; + + // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. + // TODO: Need to check GV's offset % size if doing offset folding into globals. + assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global"); + auto GV = Adrp.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return None; + + auto &MF = *RootDef.getParent()->getParent(); + if (GV->getPointerAlignment(MF.getDataLayout()) < Size) + return None; + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); + MachineIRBuilder MIRBuilder(RootDef); + Register AdrpReg = Adrp.getOperand(0).getReg(); + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addGlobalAddress(GV, /* Offset */ 0, + OpFlags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + }}}; +} + /// Select a "register plus scaled unsigned 12-bit immediate" address. The /// "Size" argument is the size in bytes of the memory reference, which /// determines the scale. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, unsigned Size) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); + MachineFunction &MF = *Root.getParent()->getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); if (!Root.isReg()) return None; @@ -4618,6 +5257,14 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, }}; } + CodeModel::Model CM = MF.getTarget().getCodeModel(); + // Check if we can fold in the ADD of small code model ADRP + ADD address. + if (CM == CodeModel::Small) { + auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); + if (OpFns) + return OpFns; + } + if (isBaseWithConstantOffset(Root, MRI)) { MachineOperand &LHS = RootDef->getOperand(1); MachineOperand &RHS = RootDef->getOperand(2); @@ -4717,7 +5364,11 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( // Handle explicit extend instructions first. if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { - unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned Size; + if (Opc == TargetOpcode::G_SEXT) + Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + else + Size = MI.getOperand(2).getImm(); assert(Size != 64 && "Extend from 64 bits?"); switch (Size) { case 8: @@ -4782,6 +5433,52 @@ Register AArch64InstructionSelector::narrowExtendRegIfNeeded( return Copy.getReg(0); } +Register AArch64InstructionSelector::widenGPRBankRegIfNeeded( + Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const { + assert(WideSize >= 8 && "WideSize is smaller than all possible registers?"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NarrowSize = MRI.getType(Reg).getSizeInBits(); + assert(WideSize >= NarrowSize && + "WideSize cannot be smaller than NarrowSize!"); + + // If the sizes match, just return the register. + // + // If NarrowSize is an s1, then we can select it to any size, so we'll treat + // it as a don't care. 
+ if (NarrowSize == WideSize || NarrowSize == 1) + return Reg; + + // Now check the register classes. + const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); + const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize); + const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize); + assert(OrigRC && "Could not determine narrow RC?"); + assert(WideRC && "Could not determine wide RC?"); + + // If the sizes differ, but the register classes are the same, there is no + // need to insert a SUBREG_TO_REG. + // + // For example, an s8 that's supposed to be a GPR will be selected to either + // a GPR32 or a GPR64 register. Note that this assumes that the s8 will + // always end up on a GPR32. + if (OrigRC == WideRC) + return Reg; + + // We have two different register classes. Insert a SUBREG_TO_REG. + unsigned SubReg = 0; + getSubRegForClass(OrigRC, TRI, SubReg); + assert(SubReg && "Couldn't determine subregister?"); + + // Build the SUBREG_TO_REG and return the new, widened register. + auto SubRegToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {}) + .addImm(0) + .addUse(Reg) + .addImm(SubReg); + constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI); + return SubRegToReg.getReg(0); +} + /// Select an "extended register" operand. This operand folds in an extend /// followed by an optional left shift. InstructionSelector::ComplexRendererFns @@ -4908,6 +5605,95 @@ bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { } } + +// Perform fixups on the given PHI instruction's operands to force them all +// to be the same as the destination regbank. +static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, + const AArch64RegisterBankInfo &RBI) { + assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI"); + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg); + assert(DstRB && "Expected PHI dst to have regbank assigned"); + MachineIRBuilder MIB(MI); + + // Go through each operand and ensure it has the same regbank. + for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) + continue; + Register OpReg = MO.getReg(); + const RegisterBank *RB = MRI.getRegBankOrNull(OpReg); + if (RB != DstRB) { + // Insert a cross-bank copy. + auto *OpDef = MRI.getVRegDef(OpReg); + const LLT &Ty = MRI.getType(OpReg); + MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator())); + auto Copy = MIB.buildCopy(Ty, OpReg); + MRI.setRegBank(Copy.getReg(0), *DstRB); + MO.setReg(Copy.getReg(0)); + } + } +} + +void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { + // We're looking for PHIs, build a list so we don't invalidate iterators. + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<MachineInstr *, 32> Phis; + for (auto &BB : MF) { + for (auto &MI : BB) { + if (MI.getOpcode() == TargetOpcode::G_PHI) + Phis.emplace_back(&MI); + } + } + + for (auto *MI : Phis) { + // We need to do some work here if the operand types are < 16 bit and they + // are split across fpr/gpr banks. Since all types <32b on gpr + // end up being assigned gpr32 regclasses, we can end up with PHIs here + // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't + // be selecting heterogenous regbanks for operands if possible, but we + // still need to be able to deal with it here. 
+ // + // To fix this, if we have a gpr-bank operand < 32b in size and at least + // one other operand is on the fpr bank, then we add cross-bank copies + // to homogenize the operand banks. For simplicity the bank that we choose + // to settle on is whatever bank the def operand has. For example: + // + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2 + // => + // %bb2: + // ... + // %in2_copy:gpr(s16) = COPY %in2:fpr(s16) + // ... + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 + bool HasGPROp = false, HasFPROp = false; + for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) { + const auto &MO = MI->getOperand(OpIdx); + if (!MO.isReg()) + continue; + const LLT &Ty = MRI.getType(MO.getReg()); + if (!Ty.isValid() || !Ty.isScalar()) + break; + if (Ty.getSizeInBits() >= 32) + break; + const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); + // If for some reason we don't have a regbank yet. Don't try anything. + if (!RB) + break; + + if (RB->getID() == AArch64::GPRRegBankID) + HasGPROp = true; + else + HasFPROp = true; + } + // We have heterogenous regbanks, need to fixup. + if (HasGPROp && HasFPROp) + fixupPHIOpBanks(*MI, MRI, RBI); + } +} + namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 95719a35c6daa..2eaec0b970fa6 100644 --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -30,7 +30,8 @@ using namespace LegalizeActions; using namespace LegalizeMutations; using namespace LegalityPredicates; -AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { +AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) + : ST(&ST) { using namespace TargetOpcode; const LLT p0 = LLT::pointer(0, 64); const LLT s1 = LLT::scalar(1); @@ -52,13 +53,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); + const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); + // FIXME: support subtargets which have neon/fp-armv8 disabled. 
if (!ST.hasNEON() || !ST.hasFPARMv8()) { computeTables(); return; } - getActionDefinitionsBuilder(G_IMPLICIT_DEF) + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) @@ -105,10 +108,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .minScalarSameAs(1, 0); getActionDefinitionsBuilder(G_PTR_ADD) - .legalFor({{p0, s64}}) + .legalFor({{p0, s64}, {v2p0, v2s64}}) .clampScalar(1, s64, s64); - getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0}); + getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32, s64}) @@ -375,7 +378,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); - getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder(G_SEXT_INREG) + .legalFor({s32, s64}) + .lower(); // FP conversions getActionDefinitionsBuilder(G_FPTRUNC).legalFor( @@ -413,7 +418,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { // Pointer-handling getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); - getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + + if (TM.getCodeModel() == CodeModel::Small) + getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); + else + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); getActionDefinitionsBuilder(G_PTRTOINT) .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) @@ -617,10 +626,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { verify(*ST.getInstrInfo()); } -bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { +bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; switch (MI.getOpcode()) { default: // No idea what to do. @@ -634,19 +644,53 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); } llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); + // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + + // G_ADD_LOW instructions. + // By splitting this here, we can optimize accesses in the small code model by + // folding in the G_ADD_LOW into the load/store offset. + auto GV = MI.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return true; // Don't want to modify TLS vars. + + auto &TM = ST->getTargetLowering()->getTargetMachine(); + unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM); + + if (OpFlags & AArch64II::MO_GOT) + return true; + + Register DstReg = MI.getOperand(0).getReg(); + auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) + .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); + // Set the regclass on the dest reg too. 
+ MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + + MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) + .addGlobalAddress(GV, 0, + OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + MI.eraseFromParent(); + return true; +} + bool AArch64LegalizerInfo::legalizeIntrinsic( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; switch (MI.getIntrinsicID()) { case Intrinsic::memcpy: case Intrinsic::memset: case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, MRI, MI) == + if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == LegalizerHelper::UnableToLegalize) return false; MI.eraseFromParent(); @@ -675,7 +719,6 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr( if (Amount > 31) return true; // This will have to remain a register variant. assert(MRI.getType(AmtReg).getSizeInBits() == 32); - MIRBuilder.setInstr(MI); auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); MI.getOperand(2).setReg(ExtCst.getReg(0)); return true; @@ -704,17 +747,15 @@ bool AArch64LegalizerInfo::legalizeLoadStore( return false; } - MIRBuilder.setInstr(MI); unsigned PtrSize = ValTy.getElementType().getSizeInBits(); const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize); auto &MMO = **MI.memoperands_begin(); if (MI.getOpcode() == TargetOpcode::G_STORE) { - auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); - MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); + auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg); + MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO); } else { - Register NewReg = MRI.createGenericVirtualRegister(NewTy); - auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); - MIRBuilder.buildBitcast({ValReg}, {NewLoad}); + auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO); + MIRBuilder.buildBitcast(ValReg, NewLoad); } MI.eraseFromParent(); return true; @@ -723,9 +764,8 @@ bool AArch64LegalizerInfo::legalizeLoadStore( bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const { - MIRBuilder.setInstr(MI); MachineFunction &MF = MIRBuilder.getMF(); - unsigned Align = MI.getOperand(2).getImm(); + Align Alignment(MI.getOperand(2).getImm()); Register Dst = MI.getOperand(0).getReg(); Register ListPtr = MI.getOperand(1).getReg(); @@ -733,21 +773,19 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); const unsigned PtrSize = PtrTy.getSizeInBits() / 8; - Register List = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildLoad( - List, ListPtr, + const Align PtrAlign = Align(PtrSize); + auto List = MIRBuilder.buildLoad( + PtrTy, ListPtr, *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, - PtrSize, /* Align = */ PtrSize)); + PtrSize, PtrAlign)); - Register DstPtr; - if (Align > PtrSize) { + MachineInstrBuilder DstPtr; + if (Alignment > PtrAlign) { // Realign the list to the actual required alignment. 
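The realignment step below is the usual align-up idiom, add (alignment - 1) and then clear the low bits (which is what buildMaskLowPtrBits does on the pointer). A quick standalone check of that arithmetic (alignUp is an illustrative name):

#include <cassert>
#include <cstdint>

// Align-up for a power-of-two alignment A: (P + A - 1) & ~(A - 1).
static uint64_t alignUp(uint64_t P, uint64_t A) {
  return (P + A - 1) & ~(A - 1);
}

int main() {
  assert(alignUp(0x1001, 16) == 0x1010);
  assert(alignUp(0x1010, 16) == 0x1010); // already aligned: unchanged
  assert(alignUp(0, 8) == 0);
  return 0;
}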
- auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); - + auto AlignMinus1 = + MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1); auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); - - DstPtr = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); + DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment)); } else DstPtr = List; @@ -755,16 +793,16 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, MIRBuilder.buildLoad( Dst, DstPtr, *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, - ValSize, std::max(Align, PtrSize))); + ValSize, std::max(Alignment, PtrAlign))); - auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize)); + auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign)); auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); - MIRBuilder.buildStore( - NewList, ListPtr, - *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, - PtrSize, /* Align = */ PtrSize)); + MIRBuilder.buildStore(NewList, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, + PtrSize, PtrAlign)); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 15161bab466c4..1cb24559c1abf 100644 --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -27,12 +27,10 @@ class AArch64LegalizerInfo : public LegalizerInfo { public: AArch64LegalizerInfo(const AArch64Subtarget &ST); - bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; - bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + bool legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const override; private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -43,6 +41,11 @@ private: bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; + + bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + const AArch64Subtarget *ST; }; } // End llvm namespace. #endif diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp new file mode 100644 index 0000000000000..baa8515baf3ea --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -0,0 +1,507 @@ + //=== lib/CodeGen/GlobalISel/AArch64PostLegalizerCombiner.cpp -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This performs post-legalization combines on generic MachineInstrs. +// +// Any combine that this pass performs must preserve instruction legality. +// Combines unconcerned with legality should be handled by the +// PreLegalizerCombiner instead. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR. +/// +/// Used for matching target-supported shuffles before codegen. +struct ShuffleVectorPseudo { + unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1) + Register Dst; ///< Destination register. + SmallVector<SrcOp, 2> SrcOps; ///< Source registers. + ShuffleVectorPseudo(unsigned Opc, Register Dst, + std::initializer_list<SrcOp> SrcOps) + : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; + ShuffleVectorPseudo() {} +}; + +/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. +/// If \p MI is not a splat, returns None. +static Optional<int> getSplatIndex(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + "Only G_SHUFFLE_VECTOR can have a splat index!"); + ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); + auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; }); + + // If all elements are undefined, this shuffle can be considered a splat. + // Return 0 for better potential for callers to simplify. + if (FirstDefinedIdx == Mask.end()) + return 0; + + // Make sure all remaining elements are either undef or the same + // as the first non-undef value. + int SplatValue = *FirstDefinedIdx; + if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()), + [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; })) + return None; + + return SplatValue; +} + +/// Check if a vector shuffle corresponds to a REV instruction with the +/// specified blocksize. +static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, + unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + assert(EltSize != 64 && "EltSize cannot be 64 for REV mask."); + + unsigned BlockElts = M[0] + 1; + + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSize; + + if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + // Ignore undef indices. + if (M[i] < 0) + continue; + if (static_cast<unsigned>(M[i]) != + (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts. +/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult. +static bool isTRNMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + WhichResult = (M[0] == 0 ? 
0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) || + (M[i + 1] >= 0 && + static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector +/// sources of the shuffle are different. +static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, + unsigned NumElts) { + // Look for the first non-undef element. + auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); + if (FirstRealElt == M.end()) + return None; + + // Use APInt to handle overflow when calculating expected element. + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + + // The following shuffle indices must be the successive elements after the + // first real element. + if (any_of( + make_range(std::next(FirstRealElt), M.end()), + [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; })) + return None; + + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + uint64_t Imm = ExpectedElt.getZExtValue(); + bool ReverseExt = false; + + // There are two difference cases requiring to reverse input vectors. + // For example, for vector <4 x i32> we have the following cases, + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // to reverse two input vectors. + if (Imm < NumElts) + ReverseExt = true; + else + Imm -= NumElts; + return std::make_pair(ReverseExt, Imm); +} + +/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts. +/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult. +static bool isUZPMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + // Skip undef indices. + if (M[i] < 0) + continue; + if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult) + return false; + } + return true; +} + +/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts. +/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult. +static bool isZipMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + + // 0 means use ZIP1, 1 means use ZIP2. + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) || + (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts)) + return false; + Idx += 1; + } + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a +/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc. 
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned EltSize = Ty.getScalarSizeInBits(); + + // Element size for a rev cannot be 64. + if (EltSize == 64) + return false; + + unsigned NumElts = Ty.getNumElements(); + + // Try to produce G_REV64 + if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) { + MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src}); + return true; + } + + // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support. + // This should be identical to above, but with a constant 32 and constant + // 16. + return false; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_TRN1 or G_TRN2 instruction. +static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_UZP1 or G_UZP2 instruction. +/// +/// \param [in] MI - The shuffle vector instruction. +/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success. +static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isUZPMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isZipMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + if (Lane != 0) + return false; + + // Try to match a vector splat operation into a dup instruction. 
+ // We're looking for this pattern: + // + // %scalar:gpr(s64) = COPY $x0 + // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF + // %cst0:gpr(s32) = G_CONSTANT i32 0 + // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) + // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) + // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>) + // + // ...into: + // %splat = G_DUP %scalar + + // Begin matching the insert. + auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT, + MI.getOperand(1).getReg(), MRI); + if (!InsMI) + return false; + // Match the undef vector operand. + if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), + MRI)) + return false; + + // Match the index constant 0. + int64_t Index = 0; + if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) + return false; + + MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), + {InsMI->getOperand(2).getReg()}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromBuildVector(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(Lane >= 0 && "Expected positive lane?"); + // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the + // lane's definition directly. + auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, + MI.getOperand(1).getReg(), MRI); + if (!BuildVecMI) + return false; + Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); + MatchInfo = + ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg}); + return true; +} + +static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + auto MaybeLane = getSplatIndex(MI); + if (!MaybeLane) + return false; + int Lane = *MaybeLane; + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane < 0) + Lane = 0; + if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo)) + return true; + if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo)) + return true; + return false; +} + +static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + Register Dst = MI.getOperand(0).getReg(); + auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(), + MRI.getType(Dst).getNumElements()); + if (!ExtInfo) + return false; + bool ReverseExt; + uint64_t Imm; + std::tie(ReverseExt, Imm) = *ExtInfo; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + if (ReverseExt) + std::swap(V1, V2); + uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8; + Imm *= ExtFactor; + MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm}); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo. +/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR. +static bool applyShuffleVectorPseudo(MachineInstr &MI, + ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps); + MI.eraseFromParent(); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT. +/// Special-cased because the constant operand must be emitted as a G_CONSTANT +/// for the imported tablegen patterns to work. 
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + // Tablegen patterns expect an i32 G_CONSTANT as the final op. + auto Cst = + MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm()); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, + {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst}); + MI.eraseFromParent(); + return true; +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AArch64PostLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AArch64GenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + AArch64PostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, + MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + const auto *LI = + MI.getParent()->getParent()->getSubtarget().getLegalizerInfo(); + CombinerHelper Helper(Observer, B, KB, MDT, LI); + AArch64GenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg); + return Generated.tryCombineAll(Observer, MI, B, Helper); +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +class AArch64PostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AArch64PostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AArch64PostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + assert(MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Legalized) && + "Expected a legalized function?"); + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); 
+ bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) { + return new AArch64PostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 230fd514d0222..9a1f200d52222 100644 --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -27,28 +27,62 @@ using namespace llvm; using namespace MIPatternMatch; +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +static bool matchFConstantToConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. 
+static void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + +class AArch64PreLegalizerCombinerHelperState { +protected: + CombinerHelper &Helper; + +public: + AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper) + : Helper(Helper) {} +}; + #define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AArch64GenGICombiner.inc" +#include "AArch64GenPreLegalizeGICombiner.inc" #undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS namespace { #define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AArch64GenGICombiner.inc" +#include "AArch64GenPreLegalizeGICombiner.inc" #undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H class AArch64PreLegalizerCombinerInfo : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; + AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; public: - AArch64GenPreLegalizerCombinerHelper Generated; - AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), KB(KB), MDT(MDT) { - if (!Generated.parseCommandLineOption()) + if (!GeneratedRuleCfg.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -60,6 +94,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); + AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper); switch (MI.getOpcode()) { case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: @@ -79,7 +114,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } } - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; switch (MI.getOpcode()) { @@ -93,7 +128,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } #define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AArch64GenGICombiner.inc" +#include "AArch64GenPreLegalizeGICombiner.inc" #undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP // Pass boilerplate diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 40efac261fd99..7e3ff1948dad7 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -38,58 +38,58 @@ using namespace llvm; AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) : AArch64GenRegisterBankInfo() { - static bool AlreadyInit = false; - // We have only one set of register banks, whatever the subtarget - // is. Therefore, the initialization of the RegBanks table should be - // done only once. Indeed the table of all register banks - // (AArch64::RegBanks) is unique in the compiler. At some point, it - // will get tablegen'ed and the whole constructor becomes empty. 
- if (AlreadyInit) - return; - AlreadyInit = true; - - const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); - (void)RBGPR; - assert(&AArch64::GPRRegBank == &RBGPR && - "The order in RegBanks is messed up"); - - const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); - (void)RBFPR; - assert(&AArch64::FPRRegBank == &RBFPR && - "The order in RegBanks is messed up"); - - const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID); - (void)RBCCR; - assert(&AArch64::CCRegBank == &RBCCR && "The order in RegBanks is messed up"); - - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && - "Subclass not added?"); - assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - - // The FPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && - "Subclass not added?"); - assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && - "Subclass not added?"); - assert(RBFPR.getSize() == 512 && - "FPRs should hold up to 512-bit via QQQQ sequence"); - - assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && - "Class not added?"); - assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); - - // Check that the TableGen'ed like file is in sync we our expectations. - // First, the Idx. - assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, - {PMI_GPR32, PMI_GPR64}) && - "PartialMappingIdx's are incorrectly ordered"); - assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, - {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, - PMI_FPR256, PMI_FPR512}) && - "PartialMappingIdx's are incorrectly ordered"); + static llvm::once_flag InitializeRegisterBankFlag; + + static auto InitializeRegisterBankOnce = [&]() { + // We have only one set of register banks, whatever the subtarget + // is. Therefore, the initialization of the RegBanks table should be + // done only once. Indeed the table of all register banks + // (AArch64::RegBanks) is unique in the compiler. At some point, it + // will get tablegen'ed and the whole constructor becomes empty. + + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); + (void)RBGPR; + assert(&AArch64::GPRRegBank == &RBGPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID); + (void)RBCCR; + assert(&AArch64::CCRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); + + // The FPR register bank is fully defined by all the registers in + // GR64all + its subclasses. 
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && + "Subclass not added?"); + assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && + "Subclass not added?"); + assert(RBFPR.getSize() == 512 && + "FPRs should hold up to 512-bit via QQQQ sequence"); + + assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && + "Class not added?"); + assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); + + // Check that the TableGen'ed like file is in sync we our expectations. + // First, the Idx. + assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, + {PMI_GPR32, PMI_GPR64}) && + "PartialMappingIdx's are incorrectly ordered"); + assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, + PMI_FPR256, PMI_FPR512}) && + "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. #define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ @@ -99,14 +99,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) #Idx " is incorrectly initialized"); \ } while (false) - CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); - CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); - CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); - CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); - CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); - CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); - CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR); - CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR); + CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); + CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); + CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); + CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); + CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); + CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR); + CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR); // Check value mapping. #define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ @@ -119,14 +119,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) #define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) - CHECK_VALUEMAP(GPR, 32); - CHECK_VALUEMAP(GPR, 64); - CHECK_VALUEMAP(FPR, 16); - CHECK_VALUEMAP(FPR, 32); - CHECK_VALUEMAP(FPR, 64); - CHECK_VALUEMAP(FPR, 128); - CHECK_VALUEMAP(FPR, 256); - CHECK_VALUEMAP(FPR, 512); + CHECK_VALUEMAP(GPR, 32); + CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); + CHECK_VALUEMAP(FPR, 32); + CHECK_VALUEMAP(FPR, 64); + CHECK_VALUEMAP(FPR, 128); + CHECK_VALUEMAP(FPR, 256); + CHECK_VALUEMAP(FPR, 512); // Check the value mapping for 3-operands instructions where all the operands // map to the same value mapping. 
@@ -137,13 +137,13 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ } while (false) - CHECK_VALUEMAP_3OPS(GPR, 32); - CHECK_VALUEMAP_3OPS(GPR, 64); - CHECK_VALUEMAP_3OPS(FPR, 32); - CHECK_VALUEMAP_3OPS(FPR, 64); - CHECK_VALUEMAP_3OPS(FPR, 128); - CHECK_VALUEMAP_3OPS(FPR, 256); - CHECK_VALUEMAP_3OPS(FPR, 512); + CHECK_VALUEMAP_3OPS(GPR, 32); + CHECK_VALUEMAP_3OPS(GPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 32); + CHECK_VALUEMAP_3OPS(FPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 128); + CHECK_VALUEMAP_3OPS(FPR, 256); + CHECK_VALUEMAP_3OPS(FPR, 512); #define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ do { \ @@ -165,14 +165,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) \ } while (false) - CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); #define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ do { \ @@ -193,12 +193,15 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) \ } while (false) - CHECK_VALUEMAP_FPEXT(32, 16); - CHECK_VALUEMAP_FPEXT(64, 16); - CHECK_VALUEMAP_FPEXT(64, 32); - CHECK_VALUEMAP_FPEXT(128, 64); + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); - assert(verify(TRI) && "Invalid register bank information"); + assert(verify(TRI) && "Invalid register bank information"); + }; + + llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); } unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, @@ -228,8 +231,11 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, switch (RC.getID()) { case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: + case AArch64::FPR16_loRegClassID: + case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID: case AArch64::FPR32RegClassID: case AArch64::FPR64RegClassID: + case AArch64::FPR64_loRegClassID: case AArch64::FPR128RegClassID: case AArch64::FPR128_loRegClassID: case AArch64::DDRegClassID: @@ -495,6 +501,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP( const MachineInstr &MI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { switch (MI.getOpcode()) { + case AArch64::G_DUP: case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: case TargetOpcode::G_EXTRACT_VECTOR_ELT: @@ -636,6 +643,16 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Some of the floating-point instructions have mixed GPR and FPR operands: // fine-tune the computed mapping. 
switch (Opc) { + case AArch64::G_DUP: { + Register ScalarReg = MI.getOperand(1).getReg(); + auto ScalarDef = MRI.getVRegDef(ScalarReg); + if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*ScalarDef, MRI, TRI)) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + else + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + break; + } case TargetOpcode::G_TRUNC: { LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) @@ -680,7 +697,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // In that case, we want the default mapping to be on FPR // instead of blind map every scalar to GPR. for (const MachineInstr &UseMI : - MRI.use_instructions(MI.getOperand(0).getReg())) { + MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) { // If we have at least one direct use in a FP instruction, // assume this was a floating point load in the IR. // If it was not, we would have had a bitcast before @@ -727,9 +744,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // // %z = G_SELECT %cond %x %y // fpr = G_FOO %z ... - if (any_of( - MRI.use_instructions(MI.getOperand(0).getReg()), - [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) + if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) ++NumFP; // Check if the defs of the source values always produce floating point @@ -770,7 +786,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // UNMERGE into scalars from a vector should always use FPR. // Likewise if any of the uses are FP instructions. if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || - any_of(MRI.use_instructions(MI.getOperand(0).getReg()), + any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { // Set the register bank of every operand to FPR. for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h index e956fca1aa109..e956fca1aa109 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 05a909f1780a0..9814f76258538 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -763,10 +763,10 @@ static inline bool isSVECpyImm(int64_t Imm) { bool IsImm8 = int8_t(Imm) == Imm; bool IsImm16 = int16_t(Imm & ~0xff) == Imm; - if (std::is_same<int8_t, typename std::make_signed<T>::type>::value) + if (std::is_same<int8_t, std::make_signed_t<T>>::value) return IsImm8 || uint8_t(Imm) == Imm; - if (std::is_same<int16_t, typename std::make_signed<T>::type>::value) + if (std::is_same<int16_t, std::make_signed_t<T>>::value) return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm; return IsImm8 || IsImm16; @@ -775,8 +775,7 @@ static inline bool isSVECpyImm(int64_t Imm) { /// Returns true if Imm is valid for ADD/SUB. 
template <typename T> static inline bool isSVEAddSubImm(int64_t Imm) { - bool IsInt8t = - std::is_same<int8_t, typename std::make_signed<T>::type>::value; + bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value; return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 9db746733aa35..9f7dfdf624829 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -33,6 +34,7 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; +protected: Triple TheTriple; public: @@ -68,6 +70,11 @@ public: {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; + // Fixup kinds from .reloc directive are like R_AARCH64_NONE. They do not + // require any extra processing. + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -86,8 +93,8 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; void HandleAssemblerFlag(MCAssemblerFlag Flag) {} @@ -108,7 +115,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); - case FK_NONE: case AArch64::fixup_aarch64_tlsdesc_call: return 0; @@ -237,11 +243,22 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS && AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) { - // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't - // ever be resolved in the assembler. - Ctx.reportError(Fixup.getLoc(), - "relocation for a thread-local variable points to an " - "absolute symbol"); + if (!RefKind) { + // The fixup is an expression + if (SignedValue > 0xFFFF || SignedValue < -0xFFFF) + Ctx.reportError(Fixup.getLoc(), + "fixup value out of range [-0xFFFF, 0xFFFF]"); + + // Invert the negative immediate because it will feed into a MOVN. + if (SignedValue < 0) + SignedValue = ~SignedValue; + Value = static_cast<uint64_t>(SignedValue); + } else + // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't + // ever be resolved in the assembler. 
+ Ctx.reportError(Fixup.getLoc(), + "relocation for a thread-local variable points to an " + "absolute symbol"); return Value; } @@ -329,7 +346,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value)) Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!"); LLVM_FALLTHROUGH; - case FK_NONE: case FK_SecRel_2: case FK_SecRel_4: return Value; @@ -337,9 +353,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, } Optional<MCFixupKind> AArch64AsmBackend::getFixupKind(StringRef Name) const { - if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE") - return FK_NONE; - return MCAsmBackend::getFixupKind(Name); + if (!TheTriple.isOSBinFormatELF()) + return None; + + unsigned Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/AArch64.def" +#undef ELF_RELOC + .Default(-1u); + if (Type == -1u) + return None; + return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type); } /// getFixupKindContainereSizeInBytes - The number of bytes of the @@ -386,9 +410,12 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. + unsigned Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return; + unsigned NumBytes = getFixupKindNumBytes(Kind); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); MCContext &Ctx = Asm.getContext(); int64_t SignedValue = static_cast<int64_t>(Value); @@ -424,8 +451,9 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, // FIXME: getFixupKindInfo() and getFixupKindNumBytes() could be fixed to // handle this more cleanly. This may affect the output of -show-mc-encoding. AArch64MCExpr::VariantKind RefKind = - static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); - if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) { + static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); + if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS || + (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) { // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. if (SignedValue < 0) @@ -451,9 +479,8 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, return int64_t(Value) != int64_t(int8_t(Value)); } -void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void AArch64AsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } @@ -474,7 +501,7 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) { unsigned Kind = Fixup.getKind(); - if (Kind == FK_NONE) + if (Kind >= FirstLiteralRelocationKind) return true; // The ADRP instruction adds some multiple of 0x1000 to the current PC & @@ -544,7 +571,6 @@ enum CompactUnwindEncodings { // FIXME: This should be in a separate file. class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; - bool IsILP32; /// Encode compact unwind stack adjustment for frameless functions. 
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. @@ -555,18 +581,15 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const Triple &TT, - const MCRegisterInfo &MRI, bool IsILP32) - : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI), - IsILP32(IsILP32) {} + const MCRegisterInfo &MRI) + : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { - if (IsILP32) - return createAArch64MachObjectWriter( - MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true); - else - return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL, false); + uint32_t CPUType = cantFail(MachO::getCPUType(TheTriple)); + uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TheTriple)); + return createAArch64MachObjectWriter(CPUType, CPUSubType, + TheTriple.isArch32Bit()); } /// Generate the compact unwind encoding from the CFI directives. @@ -749,8 +772,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); if (TheTriple.isOSBinFormatMachO()) { - const bool IsILP32 = TheTriple.isArch32Bit(); - return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32); + return new DarwinAArch64AsmBackend(T, TheTriple, MRI); } if (TheTriple.isOSBinFormatCOFF()) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 0fd1ca187be7f..e5637dcab9419 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -106,13 +106,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + unsigned Kind = Fixup.getTargetKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; AArch64MCExpr::VariantKind RefKind = static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); bool IsNC = AArch64MCExpr::isNotChecked(RefKind); assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_PLT) && "Should only be expression-level modifiers here"); assert((!Target.getSymB() || @@ -120,14 +124,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, "Should only be expression-level modifiers here"); if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Kind) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; case FK_Data_2: return R_CLS(PREL16); - case FK_Data_4: - return R_CLS(PREL32); + case FK_Data_4: { + return Target.getAccessVariant() == MCSymbolRefExpr::VK_PLT + ? 
R_CLS(PLT32) + : R_CLS(PREL32); + } case FK_Data_8: if (IsILP32) { Ctx.reportError(Fixup.getLoc(), @@ -185,8 +192,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; switch (Fixup.getTargetKind()) { - case FK_NONE: - return ELF::R_AARCH64_NONE; case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index c33f7e957b54a..fe4c34be1519b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -81,14 +81,14 @@ public: std::move(Emitter)), MappingSymbolCounter(0), LastEMS(EMS_None) {} - void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { + void changeSection(MCSection *Section, const MCExpr *Subsection) override { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. LastMappingSymbols[getPreviousSection().first] = LastEMS; LastEMS = LastMappingSymbols.lookup(Section); - MCELFStreamer::ChangeSection(Section, Subsection); + MCELFStreamer::changeSection(Section, Subsection); } // Reset state between object emissions @@ -102,10 +102,10 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, + void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); + MCELFStreamer::emitInstruction(Inst, STI); } /// Emit a 32-bit value as an instruction. This is only used for the .inst @@ -122,28 +122,28 @@ public: } EmitA64MappingSymbol(); - MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); + MCELFStreamer::emitBytes(StringRef(Buffer, 4)); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); + void emitBytes(StringRef Data) override { + emitDataMappingSymbol(); + MCELFStreamer::emitBytes(Data); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, Loc); + void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { + emitDataMappingSymbol(); + MCELFStreamer::emitValueImpl(Value, Size, Loc); } void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc) override { - EmitDataMappingSymbol(); + emitDataMappingSymbol(); MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); } private: @@ -153,7 +153,7 @@ private: EMS_Data }; - void EmitDataMappingSymbol() { + void emitDataMappingSymbol() { if (LastEMS == EMS_Data) return; EmitMappingSymbol("$d"); @@ -170,7 +170,7 @@ private: void EmitMappingSymbol(StringRef Name) { auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( Name + "." 
+ Twine(MappingSymbolCounter++))); - EmitLabel(Symbol); + emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); Symbol->setExternal(false); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 469892213ef87..38474d31460dd 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -283,7 +283,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, } if (Opcode == AArch64::SPACE) { - O << '\t' << MAI.getCommentString() << " SPACE"; + O << '\t' << MAI.getCommentString() << " SPACE " + << MI->getOperand(1).getImm(); printAnnotation(O, Annot); return; } @@ -295,7 +296,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } - if (!printAliasInstr(MI, STI, O)) + if (!printAliasInstr(MI, Address, STI, O)) printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); @@ -900,6 +901,19 @@ void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, O << format("#%#llx", Op.getImm()); } +template<int Size> +void AArch64InstPrinter::printSImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Size == 8) + O << "#" << formatImm((signed char)Op.getImm()); + else if (Size == 16) + O << "#" << formatImm((signed short)Op.getImm()); + else + O << "#" << formatImm(Op.getImm()); +} + void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -1334,7 +1348,8 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, O << "[" << MI->getOperand(OpNum).getImm() << "]"; } -void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, +void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, + unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); @@ -1342,17 +1357,20 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << formatImm(Op.getImm() * 4); + int64_t Offset = Op.getImm() * 4; + if (PrintBranchImmAsAddress) + O << formatHex(Address + Offset); + else + O << "#" << formatImm(Offset); return; } // If the branch target is simply an address then print it in hex. const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + int64_t TargetAddress; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(TargetAddress)) { + O << formatHex(TargetAddress); } else { // Otherwise, just print the expression. MI->getOperand(OpNum).getExpr()->print(O, &MAI); @@ -1411,6 +1429,12 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, return; } + // Horrible hack for two different registers having the same encoding. 
+ if (Val == AArch64SysReg::TRCEXTINSELR) { + O << "TRCEXTINSELR"; + return; + } + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) O << Reg->Name; @@ -1431,6 +1455,12 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, return; } + // Horrible hack for two different registers having the same encoding. + if (Val == AArch64SysReg::TRCEXTINSELR) { + O << "TRCEXTINSELR"; + return; + } + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) O << Reg->Name; @@ -1499,7 +1529,7 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum, template <typename T> void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) { - typename std::make_unsigned<T>::type HexValue = Value; + std::make_unsigned_t<T> HexValue = Value; if (getPrintImmHex()) O << '#' << formatHex((uint64_t)HexValue); @@ -1544,8 +1574,8 @@ template <typename T> void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - typedef typename std::make_signed<T>::type SignedT; - typedef typename std::make_unsigned<T>::type UnsignedT; + typedef std::make_signed_t<T> SignedT; + typedef std::make_unsigned_t<T> UnsignedT; uint64_t Val = MI->getOperand(OpNum).getImm(); UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 993f379b53433..6da5f0e81c803 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -32,10 +32,10 @@ public: // Autogenerated by tblgen. 
virtual void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, + virtual bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); @@ -56,6 +56,9 @@ protected: raw_ostream &O); void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + template <int Size> + void printSImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); template <typename T> void printImmSVE(T Value, raw_ostream &O); void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O); @@ -97,7 +100,7 @@ protected: const MCSubtargetInfo &STI, raw_ostream &O); void printInverseCondCode(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned OpNum, + void printAlignedLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, raw_ostream &O); @@ -202,10 +205,10 @@ public: void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O) override; - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, + bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O) override; + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O) override; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 5926a4f81616c..9a63e26dec190 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -60,7 +60,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context); MCSymbol *PCSym = Context.createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context); return MCBinaryExpr::createSub(Res, PC, Context); } @@ -96,8 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - UseIntegratedAssembler = true; - HasIdentDirective = true; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 8f4d9cb94d607..da8f511c650f0 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -569,23 +569,24 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, if (UImm16MO.isImm()) return EncodedValue; - const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr()); - switch (A64E->getKind()) { - case AArch64MCExpr::VK_DTPREL_G2: - 
case AArch64MCExpr::VK_DTPREL_G1: - case AArch64MCExpr::VK_DTPREL_G0: - case AArch64MCExpr::VK_GOTTPREL_G1: - case AArch64MCExpr::VK_TPREL_G2: - case AArch64MCExpr::VK_TPREL_G1: - case AArch64MCExpr::VK_TPREL_G0: - return EncodedValue & ~(1u << 30); - default: - // Nothing to do for an unsigned fixup. - return EncodedValue; + const MCExpr *E = UImm16MO.getExpr(); + if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) { + switch (A64E->getKind()) { + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } } - - return EncodedValue & ~(1u << 30); + return EncodedValue; } void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 7dc3665baabc5..209bff3a23117 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -254,7 +254,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, // Initial state of the frame pointer is SP. unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0); MAI->addInitialFrameState(Inst); return MAI; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index fc04d37eb3623..b0f414bd27edd 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -139,7 +139,7 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section, return false; if (RefSec.getSegmentName() == "__DATA" && - RefSec.getSectionName() == "__objc_classrefs") + RefSec.getName() == "__objc_classrefs") return false; // FIXME: ld64 currently handles internal pointer-sized relocations @@ -407,5 +407,5 @@ std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) { return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype, - IsILP32); + IsILP32); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index f70752f5303f3..48ed68f492635 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -51,7 +51,7 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) { Inst >>= 8; } - getStreamer().EmitBytes(StringRef(Buffer, 4)); + getStreamer().emitBytes(StringRef(Buffer, 4)); } namespace llvm { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 37c6fbb039081..03fbab5142a2e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -28,7 +28,7 @@ public: void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; - void FinishImpl() override; + void 
finishImpl() override; }; void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { @@ -45,11 +45,11 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { EHStreamer.Emit(*this); } -void AArch64WinCOFFStreamer::FinishImpl() { - EmitFrames(nullptr); +void AArch64WinCOFFStreamer::finishImpl() { + emitFrames(nullptr); EmitWindowsUnwindTables(); - MCWinCOFFStreamer::FinishImpl(); + MCWinCOFFStreamer::finishImpl(); } } // end anonymous namespace @@ -68,7 +68,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode, WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); if (!CurFrame) return; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); if (InEpilogCFI) CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); @@ -158,7 +158,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() { if (!CurFrame) return; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); CurFrame->PrologEnd = Label; WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); auto it = CurFrame->Instructions.begin(); @@ -172,7 +172,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() { return; InEpilogCFI = true; - CurrentEpilog = S.EmitCFILabel(); + CurrentEpilog = S.emitCFILabel(); } void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() { @@ -182,7 +182,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() { return; InEpilogCFI = false; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); CurrentEpilog = nullptr; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a172b8d7e6b0a..a005d1e65abe1 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10,6 +10,14 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64Setcc : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>, + SDTCisVT<4, OtherVT> +]>; + +def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>; + def SVEPatternOperand : AsmOperandClass { let Name = "SVEPattern"; let ParserMethod = "tryParseSVEPattern"; @@ -33,7 +41,7 @@ def SVEPrefetchOperand : AsmOperandClass { let RenderMethod = "addPrefetchOperands"; } -def sve_prfop : Operand<i32>, ImmLeaf<i32, [{ +def sve_prfop : Operand<i32>, TImmLeaf<i32, [{ return (((uint32_t)Imm) <= 15); }]> { let PrintMethod = "printPrefetchOp<true>"; @@ -167,8 +175,8 @@ def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<i def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">; class imm8_opt_lsl<int ElementWidth, string printType, - AsmOperandClass OpndClass, code Predicate> - : Operand<i32>, ImmLeaf<i32, Predicate> { + AsmOperandClass OpndClass> + : Operand<i32> { let EncoderMethod = "getImm8OptLsl"; let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">"; let PrintMethod = "printImm8OptLsl<" # printType # ">"; @@ -176,31 +184,15 @@ class imm8_opt_lsl<int ElementWidth, string printType, let MIOperandInfo = (ops i32imm, i32imm); } -def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{ - return 
AArch64_AM::isSVECpyImm<int8_t>(Imm); -}]>; -def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{ - return AArch64_AM::isSVECpyImm<int16_t>(Imm); -}]>; -def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{ - return AArch64_AM::isSVECpyImm<int32_t>(Imm); -}]>; -def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{ - return AArch64_AM::isSVECpyImm<int64_t>(Imm); -}]>; - -def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{ - return AArch64_AM::isSVEAddSubImm<int8_t>(Imm); -}]>; -def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{ - return AArch64_AM::isSVEAddSubImm<int16_t>(Imm); -}]>; -def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{ - return AArch64_AM::isSVEAddSubImm<int32_t>(Imm); -}]>; -def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{ - return AArch64_AM::isSVEAddSubImm<int64_t>(Imm); -}]>; +def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>; +def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>; +def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>; +def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>; + +def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>; +def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>; +def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>; +def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>; def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>; def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>; @@ -212,9 +204,13 @@ def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>", def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>; def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>; +def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; + def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>; def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; +def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>; + class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; let DiagnosticType = "Invalid" # Name; @@ -324,6 +320,16 @@ class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), (inst $Op1, i32:$imm)>; +class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op, + ZPRRegOp zprty, Operand ImmTy, Instruction inst> + : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))), + (inst $Op1, ImmTy:$imm)>; + +class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op, + ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> + : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + (inst $Op1, i32:$imm)>; + class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))), @@ -367,8 +373,22 @@ class SVE_4_Op_Imm_Pat<ValueType 
vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; +def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>; def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>; +let AddedComplexity = 1 in { +class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1, + ValueType vt2, ValueType vt3, Instruction inst> +: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), + (inst $Op1, $Op2, $Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op, + ValueType vt1, ValueType vt2, + Operand vt3, Instruction inst> +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3)>; +} + // // Common but less generic patterns. // @@ -378,6 +398,69 @@ class SVE_1_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1)), (inst (IMPLICIT_DEF), (ptrue 31), $Op1)>; +class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + ValueType vt2, Instruction inst, Instruction ptrue> +: Pat<(vtd (op vt1:$Op1, vt2:$Op2)), + (inst (ptrue 31), $Op1, $Op2)>; + +// +// Pseudo -> Instruction mappings +// +def getSVEPseudoMap : InstrMapping { + let FilterClass = "SVEPseudo2Instr"; + let RowFields = ["PseudoName"]; + let ColFields = ["IsInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +class SVEPseudo2Instr<string name, bit instr> { + string PseudoName = name; + bit IsInstr = instr; +} + +// Lookup e.g. DIV -> DIVR +def getSVERevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Lookup e.g. 
DIVR -> DIV +def getSVENonRevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +class SVEInstr2Rev<string name1, string name2, bit name1IsReverseInstr> { + string InstrName = !if(name1IsReverseInstr, name1, name2); + bit isReverseInstr = name1IsReverseInstr; +} + +// +// Pseudos for destructive operands +// +let hasNoSchedulingInfo = 1 in { + class PredTwoOpPseudo<string name, ZPRRegOp zprty, + FalseLanesEnum flags = FalseLanesNone> + : SVEPseudo2Instr<name, 0>, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> { + let FalseLanes = flags; + } + + class PredTwoOpImmPseudo<string name, ZPRRegOp zprty, Operand immty, + FalseLanesEnum flags = FalseLanesNone> + : SVEPseudo2Instr<name, 0>, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { + let FalseLanes = flags; + } +} + //===----------------------------------------------------------------------===// // SVE Predicate Misc Group //===----------------------------------------------------------------------===// @@ -566,7 +649,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -680,7 +763,7 @@ class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -941,11 +1024,46 @@ multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> { def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_perm_tbl<string asm> { +multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> { def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>; def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>; def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>; def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>; + + def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), + (nxv16i8 (!cast<Instruction>(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, + nxv16i8:$Op2, zsub1), + nxv16i8:$Op3))>; + + def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)), + (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)), + (nxv4i32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0, + nxv4i32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)), + (nxv2i64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0, + nxv2i64:$Op2, zsub1), + nxv2i64:$Op3))>; + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)), + (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)), + (nxv4f32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0, + nxv4f32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)), + (nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0, + nxv2f64:$Op2, zsub1), + 
nxv2i64:$Op3))>; } class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty> @@ -967,11 +1085,20 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty> let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_perm_tbx<string asm> { +multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> { def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>; def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>; def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>; def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; + + def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty> @@ -1072,7 +1199,7 @@ class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> { @@ -1102,7 +1229,7 @@ class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> { @@ -1135,7 +1262,7 @@ class sve_int_perm_extract_i<string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1244,13 +1371,22 @@ class sve_int_pred_log<bits<4> opc, string asm> } -multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op> { +multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred = null_frag> { def NAME : sve_int_pred_log<opc, asm>; def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>; + def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1, + !cast<Instruction>(NAME), PTRUE_B>; + def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1, + !cast<Instruction>(NAME), PTRUE_H>; + def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4i1, nxv4i1, + !cast<Instruction>(NAME), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1, + !cast<Instruction>(NAME), PTRUE_D>; } @@ -1272,7 +1408,7 @@ class sve_int_log_imm<bits<2> opc, string asm> let Constraints = "$Zdn = $_Zdn"; let DecoderMethod = "DecodeSVELogicalImmInstruction"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1357,7 +1493,8 @@ class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass 
sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op> { +multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, + SDPatternOperator op, SDPatternOperator int_op> { def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>; @@ -1367,6 +1504,12 @@ multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; + + // Intrinsic version + def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1394,7 +1537,7 @@ class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1423,15 +1566,21 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, - SDPatternOperator op> { - def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>; - def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>; - def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>; +multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps, + SDPatternOperator op, DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>; + def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>; + def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>; + } def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; @@ -1449,6 +1598,16 @@ multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm, def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>; } +multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> { + def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>; + def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>; + def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>; + + def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>; +} + class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty> : I<(outs 
zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3), asm, "\t$Zdn, $_Zdn, $Zm, $imm3", @@ -1466,7 +1625,7 @@ class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1551,7 +1710,7 @@ class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1586,7 +1745,7 @@ class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1620,7 +1779,7 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1646,12 +1805,12 @@ multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm, let Inst{19-16} = Zm; } - def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b:$idx))), - (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b:$idx)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b:$idx))), - (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; - def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b:$idx))), - (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))), + (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; } @@ -1694,12 +1853,12 @@ multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> { let Inst{19-16} = Zm; } - def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b:$idx))), - (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b:$idx)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b:$idx))), - (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b:$idx)>; - def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b:$idx))), - (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b:$idx)>; + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))), + (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>; } //===----------------------------------------------------------------------===// @@ -1727,7 +1886,7 @@ class sve_fp_fcmla<bits<2> sz, string asm, 
ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1767,7 +1926,7 @@ class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1785,10 +1944,10 @@ multiclass sve_fp_fcmla_by_indexed_elem<string asm, SDPatternOperator op> { let Inst{19-16} = Zm; } - def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b:$idx), (i32 complexrotateop:$imm))), - (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b:$idx, complexrotateop:$imm)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b:$idx), (i32 complexrotateop:$imm))), - (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b:$idx, complexrotateop:$imm)>; + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -1815,7 +1974,7 @@ class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1861,22 +2020,22 @@ multiclass sve2_fp_convert_down_narrow<string asm, string op> { def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>; def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>; - def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>; - def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>; + def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>; + def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>; } multiclass sve2_fp_convert_up_long<string asm, string op> { def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>; def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>; - def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv16i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>; - def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>; + def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>; + def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>; } multiclass sve2_fp_convert_down_odd_rounding_top<string asm, string op> { def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>; - def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, 
!cast<Instruction>(NAME # _DtoS)>; + def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -1902,7 +2061,7 @@ class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1942,14 +2101,14 @@ class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm, SDPatternOperator op> { def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>; - def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b, !cast<Instruction>(NAME)>; + def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>; } //===----------------------------------------------------------------------===// @@ -1974,7 +2133,7 @@ class sve2_fp_mla_long<bits<2> opc, string asm> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -2084,7 +2243,7 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = size; } @@ -2120,7 +2279,7 @@ multiclass sve2_fp_flogb<string asm, SDPatternOperator op> { multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> { def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>; - def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>; + def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -2176,7 +2335,7 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2192,11 +2351,20 @@ multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, SDPatternOperator op> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>, + SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>; + def _H : 
sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>; + def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>; + def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>; + } def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; @@ -2229,9 +2397,16 @@ multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op } // Special case for divides which are not defined for 8b/16b elements. -multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, SDPatternOperator op> { - def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>; + def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>; + } def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; @@ -2262,7 +2437,7 @@ class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2299,7 +2474,7 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2336,21 +2511,30 @@ class sve2_int_mla<bits<2> sz, bits<5> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_mla<bit S, string asm> { +multiclass sve2_int_mla<bit S, string asm, SDPatternOperator op> { def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>; def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>; def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>; def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_mla_long<bits<5> opc, string asm> { +multiclass sve2_int_mla_long<bits<5> opc, string asm, SDPatternOperator op> { def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>; def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>; def _D : 
sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2372,39 +2556,44 @@ class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> { - def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm, + SDPatternOperator op> { + def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>; + def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// // SVE2 Integer Multiply-Add Long - Indexed Group //===----------------------------------------------------------------------===// -multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> { +multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> { def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, - asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2412,13 +2601,16 @@ multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> { let Inst{11} = iop{0}; } def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, - asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2442,7 +2634,7 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp 
zprty1, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> { @@ -2474,28 +2666,28 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm, SDPatternOperator op> { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { + def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { + def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } - def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))), - (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; - def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))), - (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; } //===----------------------------------------------------------------------===// @@ -2521,24 +2713,36 @@ class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_cintx_dot<string asm> { +multiclass sve2_cintx_dot<string asm, SDPatternOperator op> { def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>; def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// // SVE2 Complex Multiply-Add Group //===----------------------------------------------------------------------===// -multiclass sve2_int_cmla<bit opc, string asm> { +multiclass sve2_int_cmla<bit opc, string asm, SDPatternOperator op> { def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>; def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>; def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>; def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>; + + def : SVE_4_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, i32, complexrotateop, !cast<Instruction>(NAME # _B)>; + def : 
SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, complexrotateop, !cast<Instruction>(NAME # _H)>; + def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, complexrotateop, !cast<Instruction>(NAME # _S)>; + def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, complexrotateop, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2563,42 +2767,58 @@ class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_cintx_dot_by_indexed_elem<string asm> { - def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { +multiclass sve2_cintx_dot_by_indexed_elem<string asm, SDPatternOperator op> { + def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { bit iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// // SVE2 Complex Multiply-Add - Indexed Group //===----------------------------------------------------------------------===// -multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> { - def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> { +multiclass sve2_cmla_by_indexed_elem<bit opc, string asm, + SDPatternOperator op> { + def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> { + def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> { bit iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3), + (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -2621,11 +2841,22 @@ class 
sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve2_int_mul<bits<3> opc, string asm> { +multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> { def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; def _H : sve2_int_mul<0b01, opc, asm, ZPR16>; def _S : sve2_int_mul<0b10, opc, asm, ZPR32>; def _D : sve2_int_mul<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; +} + +multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> { + def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; + + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; } //===----------------------------------------------------------------------===// @@ -2648,31 +2879,37 @@ class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> { - def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm, + SDPatternOperator op> { + def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> { +multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, - ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2680,13 +2917,16 @@ multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> { let Inst{11} = iop{0}; } def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, - ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>; } 
//===----------------------------------------------------------------------===// @@ -2702,7 +2942,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm, bits<5> Zdn; let Inst{31-24} = 0b01000100; let Inst{23-22} = sz; - let Inst{21} = 0b0; + let Inst{21-20} = 0b01; let Inst{20-16} = opc{5-1}; let Inst{15-14} = 0b10; let Inst{13} = opc{0}; @@ -2711,15 +2951,20 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve2_int_arith_pred<bits<6> opc, string asm> { +multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> { def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>; def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>; def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>; def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm, @@ -2739,14 +2984,18 @@ class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty1.ElementSize; } -multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> { +multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> { def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>; def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>; def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>; } class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc, @@ -2770,19 +3019,26 @@ class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> { +multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; } -multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> { +multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> { def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>; def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>; def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, 
!cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2806,21 +3062,47 @@ class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> { +multiclass sve2_wide_int_arith_long<bits<5> opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>; def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>; def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>; + + def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> { +multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>; +} + +multiclass sve2_wide_int_arith_pmul<bits<2> sz, bits<5> opc, string asm, + SDPatternOperator op> { + def NAME : sve2_wide_int_arith<sz, opc, asm, ZPR128, ZPR64, ZPR64>; + + // To avoid using 128 bit elements in the IR, the pattern below works with + // llvm intrinsics with the _pair suffix, to reflect that + // _Q is implemented as a pair of _D. + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>; } -multiclass sve2_pmul_long<bits<1> opc, string asm> { +multiclass sve2_pmul_long<bits<1> opc, string asm, SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>; def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>; + + // To avoid using 128 bit elements in the IR, the patterns below work with + // llvm intrinsics with the _pair suffix, to reflect that + // _H is implemented as a pair of _B and _D is implemented as a pair of _S. 
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2844,17 +3126,27 @@ class sve2_misc<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_misc_bitwise<bits<4> opc, string asm> { +multiclass sve2_misc_bitwise<bits<4> opc, string asm, SDPatternOperator op> { def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>; def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>; def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>; def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> { +multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm, + SDPatternOperator op> { def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm, @@ -2874,15 +3166,21 @@ class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> { +multiclass sve2_bitwise_xor_interleaved<bit opc, string asm, + SDPatternOperator op> { def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>; def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>; def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>; def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm, @@ -2905,7 +3203,8 @@ class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> { +multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm, + SDPatternOperator op> { def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, ZPR16, ZPR8, vecshiftL8>; def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, @@ -2916,6 +3215,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> { ZPR64, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } + def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>; + def : 
SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2943,7 +3245,8 @@ class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm, let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> { +multiclass sve2_int_bin_shift_imm_left<bit opc, string asm, + SDPatternOperator op> { def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; @@ -2955,9 +3258,15 @@ multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> { +multiclass sve2_int_bin_shift_imm_right<bit opc, string asm, + SDPatternOperator op> { def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -2969,6 +3278,11 @@ multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; } class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, @@ -2990,11 +3304,12 @@ class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> { +multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm, + SDPatternOperator op> { def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3006,6 +3321,11 @@ multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; } class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty> @@ -3024,15 
+3344,20 @@ class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_cadd<bit opc, string asm> { +multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> { def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>; def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>; def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>; def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, complexrotateopodd, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, complexrotateopodd, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, complexrotateopodd, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>; } class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm, @@ -3052,28 +3377,41 @@ class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_absdiff_accum<bit opc, string asm> { +multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> { def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> { +multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm, + SDPatternOperator op> { def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> { +multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> { def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, ZPR32, ZPR32>; def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -3300,7 +3638,7 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = 
$_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -3465,11 +3803,12 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> { +multiclass sve_int_arith_imm0<bits<3> opc, string asm, + SDPatternOperator op, SDPatternOperator int_op> { def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; @@ -3479,6 +3818,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> { def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>; def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>; def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>; + + // Intrinsic version + def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> { @@ -3509,7 +3854,7 @@ class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -3519,10 +3864,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> { @@ -3531,10 +3876,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; - def : 
SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> { @@ -3604,11 +3949,11 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> { +multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> { def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>; def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk", @@ -3617,6 +3962,11 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> { (!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>; def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk", (!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>; } class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm, @@ -3638,11 +3988,11 @@ class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_rotate_right_imm<string asm> { +multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> { def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>; def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3654,6 +4004,10 @@ multiclass sve2_int_rotate_right_imm<string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -3678,7 +4032,7 @@ class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = 
DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -3713,26 +4067,34 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm, let Inst{12-5} = imm{7-0}; // imm8 let Inst{4-0} = Zd; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_dup_imm_pred_merge<string asm> { - let Constraints = "$Zd = $_Zd" in { - def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; - } - - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; +multiclass sve_int_dup_imm_pred_merge_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + let Constraints = "$Zd = $_Zd" in + def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m", + (ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>; def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; + (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + intty:$Zd)), + (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; +} + +multiclass sve_int_dup_imm_pred_merge<string asm> { + defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; @@ -3742,20 +4104,35 @@ multiclass sve_int_dup_imm_pred_merge<string asm> { (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; } -multiclass sve_int_dup_imm_pred_zero<string asm> { - def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; - - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; +multiclass sve_int_dup_imm_pred_zero_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, 
ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z", + (ins PPRAny:$Pg, cpyimm:$imm)>; def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; + (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty (zext (predty PPRAny:$Ps1))), + (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty (sext (predty PPRAny:$Ps1))), + (!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>; + def : Pat<(intty (anyext (predty PPRAny:$Ps1))), + (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty 0))))), + (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>; +} + +multiclass sve_int_dup_imm_pred_zero<string asm> { + defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; } //===----------------------------------------------------------------------===// @@ -3787,17 +4164,24 @@ class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm, let Defs = [NZCV]; } -multiclass sve_int_cmp_0<bits<3> opc, string asm, SDPatternOperator op, - CondCode cc> { +multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt, + ValueType intvt, sve_int_cmp cmp> { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)), + (cmp $Op1, $Op2, $Op3)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)), + (cmp $Op1, $Op3, $Op2)>; +} + +multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>; def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>; def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>; - def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; + defm : SVE_SETCC_Pat<cc, invcc, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + defm : SVE_SETCC_Pat<cc, invcc, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc, invcc, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Pat<cc, invcc, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_cmp_0_wide<bits<3> opc, string asm, SDPatternOperator op> { @@ -3852,67 +4236,35 @@ class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let ElementSize = pprty.ElementSize; } -multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, - SDPatternOperator op = null_frag, - SDPatternOperator inv_op = null_frag> { +multiclass 
SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc, + ValueType predvt, ValueType intvt, + Operand immtype, Instruction cmp> { + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt ZPR:$Zs1), + (intvt (AArch64dup (immtype:$imm))), + cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt (AArch64dup (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; +} + +multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, CondCode commuted_cc> { def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>; def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>; def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>; def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>; - // IR version - def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (simm5_64b:$imm))), - cc)), - (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, simm5_64b:$imm)>; - - // Intrinsic version - def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (simm5_32b:$imm))))), - (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (simm5_32b:$imm))))), - (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (simm5_32b:$imm))))), - (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (simm5_64b:$imm))))), - (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>; - - // Inverted intrinsic version - def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 (AArch64dup (simm5_32b:$imm))), - (nxv16i8 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 (AArch64dup (simm5_32b:$imm))), - (nxv8i16 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 (AArch64dup (simm5_32b:$imm))), - (nxv4i32 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 (AArch64dup (simm5_64b:$imm))), - (nxv2i64 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, simm5_32b, + !cast<Instruction>(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, simm5_32b, + !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, simm5_32b, + !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, simm5_64b, + !cast<Instruction>(NAME # _D)>; } @@ -3944,66 +4296,20 
@@ class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty, } multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc, - SDPatternOperator op = null_frag, - SDPatternOperator inv_op = null_frag> { + CondCode commuted_cc> { def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>; def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>; def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>; def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>; - // IR version - def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))), - cc)), - (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, imm0_127_64b:$imm)>; - - // Intrinsic version - def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (imm0_127:$imm))))), - (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (imm0_127:$imm))))), - (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (imm0_127:$imm))))), - (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))))), - (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>; - - // Inverted intrinsic version - def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 (AArch64dup (imm0_127:$imm))), - (nxv16i8 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 (AArch64dup (imm0_127:$imm))), - (nxv8i16 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 (AArch64dup (imm0_127:$imm))), - (nxv4i32 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))), - (nxv2i64 ZPR:$Zs1))), - (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, imm0_127, + !cast<Instruction>(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, imm0_127, + !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, imm0_127, + !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, imm0_127_64b, + !cast<Instruction>(NAME # _D)>; } @@ -4096,11 +4402,17 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm, let Defs = [NZCV]; } -multiclass sve2_int_while_rr<bits<1> rw, string asm> { +multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> { def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; def _H : sve2_int_while_rr<0b01, 
rw, asm, PPR16>; def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; + + def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>; + } //===----------------------------------------------------------------------===// @@ -4108,8 +4420,8 @@ multiclass sve2_int_while_rr<bits<1> rw, string asm> { //===----------------------------------------------------------------------===// class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), asm, "\t$Vd, $Pg, $Zn", "", []>, Sched<[]> { @@ -4127,13 +4439,13 @@ class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm, } multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> { - def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>; + def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat<f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } @@ -4142,8 +4454,8 @@ multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> { //===----------------------------------------------------------------------===// class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm), asm, "\t$Vdn, $Pg, $_Vdn, $Zm", "", []>, @@ -4164,13 +4476,13 @@ class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm, } multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> { - def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>; + def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, 
nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4210,6 +4522,22 @@ multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; } +multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred> +: sve_fp_3op_p_pd<opc, asm, op> { + def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16, + !cast<Instruction>(NAME # _H), PTRUE_H>; + def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16, + !cast<Instruction>(NAME # _H), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16, + !cast<Instruction>(NAME # _H), PTRUE_D>; + def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32, + !cast<Instruction>(NAME # _S), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32, + !cast<Instruction>(NAME # _S), PTRUE_D>; + def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64, + !cast<Instruction>(NAME # _D), PTRUE_D>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Compare - with Zero Group @@ -4263,11 +4591,20 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ii<string asm> { - def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>; - def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>; +multiclass sve_int_index_ii<string asm, SDPatternOperator op> { + def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>; + def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>; def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>; def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)), + (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>; + def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)), + (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>; + def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)), + (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>; + def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)), + (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>; } class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -4287,11 +4624,20 @@ class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ir<string asm> { - def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>; +multiclass sve_int_index_ir<string asm, SDPatternOperator op> { + def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)), + (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)), + (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)), + (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)), + 
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>; } class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -4311,11 +4657,20 @@ class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ri<string asm> { - def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>; +multiclass sve_int_index_ri<string asm, SDPatternOperator op> { + def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)), + (!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>; + def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)), + (!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>; + def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)), + (!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>; + def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)), + (!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>; } class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -4335,19 +4690,23 @@ class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_rr<string asm> { +multiclass sve_int_index_rr<string asm, SDPatternOperator op> { def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>; def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>; def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>; def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>; + + def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>; } // //===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm, - ZPRRegOp zprty, Operand immtype, - ElementSizeEnum size> + ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), asm, "\t$Zdn, $Pg/m, $_Zdn, $imm", "", @@ -4366,50 +4725,99 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; - let ElementSize = size; + let DestructiveInstType = DestructiveBinaryImm; + let ElementSize = zprty.ElementSize; +} + +multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> { + def _B : SVEPseudo2Instr<psName # _B, 1>, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr<psName # _H, 1>, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { + let Inst{8} = imm{3}; + } + def _S : SVEPseudo2Instr<psName # _S, 1>, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + let Inst{9-8} = imm{4-3}; + } + def _D : SVEPseudo2Instr<psName # _D, 1>, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + let Inst{22} = imm{5}; + let Inst{9-8} = imm{4-3}; + } } -multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string 
asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, - ElementSizeH> { +multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm, + string psName, + SDPatternOperator op> { + + def _B : SVEPseudo2Instr<psName # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr<psName # _H, 1>, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, - ElementSizeS> { + def _S : SVEPseudo2Instr<psName # _S, 1>, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, - ElementSizeD> { + def _D : SVEPseudo2Instr<psName # _D, 1>, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, +multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> { + def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>; + def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>; + def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, tvecshiftL32, FalseLanesZero>; + def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, tvecshiftL64, FalseLanesZero>; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftL8, !cast<Pseudo>(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftL16, !cast<Pseudo>(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftL32, !cast<Pseudo>(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftL64, !cast<Pseudo>(NAME # _ZERO_D)>; +} + +multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps, SDPatternOperator op = null_frag> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, - ElementSizeH> { + def _B : SVEPseudo2Instr<Ps # _B, 1>, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : SVEPseudo2Instr<Ps # _H, 1>, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, - ElementSizeS> { + def _S : SVEPseudo2Instr<Ps # _S, 1>, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, - ElementSizeD> { + def _D : SVEPseudo2Instr<Ps # _D, 1>, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } - def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, 
i32, vecshiftR8, !cast<Instruction>(NAME # _B)>; - def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, vecshiftR16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, vecshiftR32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, vecshiftR64, !cast<Instruction>(NAME # _D)>; + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; +} + +multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> { + def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>; + def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>; + def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesZero>; + def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesZero>; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftR8, !cast<Pseudo>(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftR16, !cast<Pseudo>(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftR32, !cast<Pseudo>(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftR64, !cast<Pseudo>(NAME # _ZERO_D)>; } class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc, @@ -4432,23 +4840,40 @@ class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, - SDPatternOperator op> { - def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; - def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; - def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; - def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; - +multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, string Ps, + SDPatternOperator op, string revname, bit isReverseInstr = 0> { + let DestructiveInstType = DestructiveBinaryCommWithRev in { + def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>, + SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>; + def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>; + def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>; + def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>; + } def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, 
!cast<Instruction>(NAME # _D)>; } +multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> { + def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>; + def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>; + def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>; + def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>; + + def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>; +} + multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; @@ -4493,7 +4918,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", - "", []>, Sched<[]> { + "", + []>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<6> imm; @@ -4508,7 +4934,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; @@ -4520,9 +4947,15 @@ multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -4534,6 +4967,11 @@ multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// // SVE Memory - Store Group @@ -4743,16 +5181,36 @@ class sve2_mem_sstnt_vs_base<bits<3> opc, string asm, let mayStore = 1; } -multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : 
sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>; +multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>; + + def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]", + (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", + (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", + (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]", - (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", - (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", - (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm, @@ -5094,6 +5552,17 @@ class sve_int_rdffr_pred<bit s, string asm> let Uses = [FFR]; } +multiclass sve_int_rdffr_pred<bit s, string asm, SDPatternOperator op> { + def _REAL : sve_int_rdffr_pred<s, asm>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>; + } +} + class sve_int_rdffr_unpred<string asm> : I< (outs PPR8:$Pd), (ins), asm, "\t$Pd", @@ -5106,11 +5575,22 @@ class sve_int_rdffr_unpred<string asm> : I< let Uses = [FFR]; } -class sve_int_wrffr<string asm> +multiclass sve_int_rdffr_unpred<string asm, SDPatternOperator op> { + def _REAL : sve_int_rdffr_unpred<asm>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd)>; + } +} + +class sve_int_wrffr<string asm, SDPatternOperator op> : I<(outs), (ins PPR8:$Pn), asm, "\t$Pn", "", - []>, Sched<[]> { + [(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> { bits<4> Pn; let Inst{31-9} = 0b00100101001010001001000; let Inst{8-5} = Pn; @@ -5120,11 +5600,11 @@ class sve_int_wrffr<string asm> let Defs = [FFR]; } -class sve_int_setffr<string asm> +class sve_int_setffr<string asm, SDPatternOperator op> : I<(outs), (ins), asm, "", "", - []>, Sched<[]> { + [(op)]>, Sched<[]> { let Inst{31-0} = 0b00100101001011001001000000000000; let hasSideEffects = 1; @@ -5219,7 +5699,7 @@ class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -5317,7 +5797,7 @@ class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -5332,9 +5812,9 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; - def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; } class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm, @@ -5380,7 +5860,7 @@ class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -5443,11 +5923,11 @@ class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_r<string asm> { +multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> { def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>; def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>; def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>; @@ -5461,6 +5941,15 @@ multiclass sve_int_perm_cpy_r<string asm> { (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Rn", (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>; + + def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)), + (!cast<Instruction>(NAME # _B) $passthru, $pg, $splat)>; + def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)), + (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4i32 (op nxv4i1:$pg, 
i32:$splat, nxv4i32:$passthru)), + (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)), + (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>; } class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -5480,11 +5969,11 @@ class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_v<string asm> { +multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> { def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>; def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>; def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>; @@ -5498,6 +5987,16 @@ multiclass sve_int_perm_cpy_v<string asm> { (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Vn", (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>; + + + def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), + (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), + (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), + (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)), + (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>; } class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty> @@ -5557,14 +6056,21 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm, multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>; + def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", - (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]", - (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; + (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", - (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>; + } } multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty, @@ -5773,6 +6279,13 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>; + } } multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty, @@ -5878,10 +6391,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm, @@ -5898,10 +6420,19 @@ multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } @@ -5940,8 +6471,15 @@ multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", (!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_prfm_si<bits<2> msz, string asm> @@ -6022,9 +6560,17 @@ class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm, multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>; def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>; + + def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; } class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> @@ -6047,11 +6593,14 @@ class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> let Inst{3-0} = prfop; } -multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> { +multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>; def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]", (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; } class sve_mem_z_fill<string asm> @@ -6130,17 +6679,38 @@ class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm, let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), - asm, listty>; +multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), + asm, Z_s>; + + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]", + (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", + (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", + (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), + asm, Z_d>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]", - (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + 
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", - (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", - (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// @@ -6190,10 +6760,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm, @@ -6210,10 +6789,19 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm, @@ -6224,8 +6812,15 @@ multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm, @@ -6235,8 +6830,15 @@ multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]", (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty> @@ -6274,8 +6876,15 @@ multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", (!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } // bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl) @@ -6305,14 +6914,27 @@ class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm, multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>; def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>; + + def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm, - RegisterOperand zprext> { + RegisterOperand zprext, PatFrag frag> { def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>; + + def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } @@ -6338,13 +6960,15 @@ class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> let hasSideEffects = 1; } -multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> { +multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>; def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]", (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; -} + def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; +} //===----------------------------------------------------------------------===// // SVE Compute Vector Address Group @@ -6600,6 +7224,12 @@ class sve_int_brkp<bits<2> opc, string asm> let Defs = !if(!eq (opc{1}, 1), [NZCV], []); } +multiclass sve_int_brkp<bits<2> opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkp<opc, asm>; + + def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>; +} + //===----------------------------------------------------------------------===// // SVE Partition Break Group @@ -6626,6 +7256,12 @@ class sve_int_brkn<bit S, string asm> let Defs = !if(!eq (S, 0b1), [NZCV], []); } +multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkn<opc, asm>; + + def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>; +} + class sve_int_break<bits<3> opc, string asm, string suffix, dag iops> : I<(outs PPR8:$Pd), iops, asm, "\t$Pd, $Pg"#suffix#", $Pn", @@ -6648,12 +7284,16 @@ class sve_int_break<bits<3> opc, string asm, string suffix, 
dag iops> } -multiclass sve_int_break_m<bits<3> opc, string asm> { +multiclass sve_int_break_m<bits<3> opc, string asm, SDPatternOperator op> { def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>; + + def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>; } -multiclass sve_int_break_z<bits<3> opc, string asm> { +multiclass sve_int_break_z<bits<3> opc, string asm, SDPatternOperator op> { def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>; + + def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>; } //===----------------------------------------------------------------------===// @@ -6683,20 +7323,23 @@ class sve2_char_match<bit sz, bit opc, string asm, let Defs = [NZCV]; } -multiclass sve2_char_match<bit opc, string asm> { +multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> { def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>; def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>; + + def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; } //===----------------------------------------------------------------------===// // SVE2 Histogram Computation - Segment Group //===----------------------------------------------------------------------===// -class sve2_hist_gen_segment<string asm> +class sve2_hist_gen_segment<string asm, SDPatternOperator op> : I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm), asm, "\t$Zd, $Zn, $Zm", "", - []>, Sched<[]> { + [(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<5> Zm; @@ -6730,9 +7373,12 @@ class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve2_hist_gen_vector<string asm> { +multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> { def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>; def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -6755,6 +7401,12 @@ class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } +multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty, + SDPatternOperator op, ValueType vt> { + def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>; + def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>; +} + class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm), asm, "\t$Zdn, $_Zdn, $Zm", @@ -6772,8 +7424,14 @@ class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty> let Constraints = "$Zdn = $_Zdn"; } -class sve2_crypto_unary_op<bit opc, string asm> -: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn), +multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty, + SDPatternOperator op, ValueType vt> { + def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>; + def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>; +} + +class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn), asm, "\t$Zdn, $_Zdn", "", []>, Sched<[]> { @@ -6785,3 +7443,389 @@ class sve2_crypto_unary_op<bit opc, string asm> let Constraints = "$Zdn = $_Zdn"; } + +multiclass sve2_crypto_unary_op<bit 
opc, string asm, SDPatternOperator op> { + def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>; + def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE BFloat16 Group +//===----------------------------------------------------------------------===// + +class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops> +: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + let Inst{31-21} = 0b01100100011; + let Inst{15-14} = opc; + let Inst{13-10} = 0b0000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +class sve_bfloat_dot<string asm> +: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> { + bits<5> Zm; + let Inst{20-16} = Zm; +} + +multiclass sve_bfloat_dot<string asm, SDPatternOperator op> { + def NAME : sve_bfloat_dot<asm>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>; +} + +class sve_bfloat_dot_indexed<string asm> +: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; +} + +multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> { + def NAME : sve_bfloat_dot_indexed<asm>; + def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>; +} + +class sve_bfloat_matmul<string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zm; + bits<5> Zda; + bits<5> Zn; + let Inst{31-21} = 0b01100100011; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> { + def NAME : sve_bfloat_matmul<asm>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>; +} + +class sve_bfloat_matmul_longvecl<bit BT, string asm> +: sve_bfloat_matmul<asm> { + let Inst{23} = 0b1; + let Inst{14-13} = 0b00; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> { + def NAME : sve_bfloat_matmul_longvecl<BT, asm>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>; +} + +class sve_bfloat_matmul_longvecl_idx<bit BT, string asm> +: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> { + bits<3> iop; + bits<3> Zm; + let Inst{23} = 0b1; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{11} = iop{0}; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> { + def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>; + def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>; +} + +class sve_bfloat_convert<bit N, string asm> +: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn), + asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<3> Pg; + bits<5> Zn; + let Inst{31-25} = 0b0110010; + let Inst{24} = N; + let Inst{23-13} = 0b10001010101; + 
let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = DestructiveOther; + let hasSideEffects = 1; + let ElementSize = ElementSizeS; +} + +multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> { + def NAME : sve_bfloat_convert<N, asm>; + def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Matrix Multiply Group +//===----------------------------------------------------------------------===// + +class sve_int_matmul<bits<2> uns, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = uns; + let Inst{21} = 0; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b100110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> { + def NAME : sve_int_matmul<uns, asm>; + + def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed<string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000100100; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b011110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> { + def NAME : sve_int_dot_mixed<asm>; + + def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign - Indexed Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed_indexed<bit U, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx), + asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<2> idx; + let Inst{31-21} = 0b01000100101; + let Inst{20-19} = idx; + let Inst{18-16} = Zm; + let Inst{15-11} = 0b00011; + let Inst{10} = U; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> { + def NAME : sve_int_dot_mixed_indexed<U, asm>; + + def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Floating Point Matrix Multiply Accumulate Group +//===----------------------------------------------------------------------===// + +class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zda), (ins 
zprty:$_Zda, zprty:$Zn, zprty:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-23} = 0b011001001; + let Inst{22} = sz; + let Inst{21} = 1; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> { + def NAME : sve_fp_matrix_mla<sz, asm, zprty>; + + def : SVE_3_Op_Pat<vt, op , vt, vt, vt, !cast<Instruction>(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Memory - Contiguous Load And Replicate 256-bit Group +//===----------------------------------------------------------------------===// + +class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), + asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> { + bits<5> Zt; + bits<5> Rn; + bits<3> Pg; + bits<4> imm4; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-20} = 0b010; + let Inst{19-16} = imm4; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> { + def NAME : sve_mem_ldor_si<sz, asm, listty>; + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", + (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", + (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]", + (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; + + // Base addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), + (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; + +} + +class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList, + RegisterOperand gprty> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), + asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> { + bits<5> Zt; + bits<3> Pg; + bits<5> Rn; + bits<5> Rm; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-21} = 0b01; + let Inst{20-16} = Rm; + let Inst{15-13} = 0; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty, + ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> { + def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>; + + def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]", + (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; + + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))), + (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>; +} + +//===----------------------------------------------------------------------===// +// SVE Interleave 128-bit Elements Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm> +: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> 
Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-21} = 0b00000101101; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b000; + let Inst{12-11} = opc; + let Inst{10} = P; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> { + def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>; + + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>; +} + +/// Addressing modes +def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>; +def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>; + +def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>; +def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>; +def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>; +def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>; + +// Predicated pseudo floating point two operand instructions. +multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> { + def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>; + def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>; + def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>; + + def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>; +} + +// Predicated pseudo integer two operand instructions. +multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> { + def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>; + def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>; + def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>; + def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>; +} + +// As sve_int_bin_pred but when only i32 and i64 vector types are required. 
+multiclass sve_int_bin_pred_sd<SDPatternOperator op> { + def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>; + def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>; + + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>; +} diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp new file mode 100644 index 0000000000000..74fe0cdd1ea7f --- /dev/null +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -0,0 +1,265 @@ +//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Performs general IR level optimizations on SVE intrinsics. +// +// The main goal of this pass is to remove unnecessary reinterpret +// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g: +// +// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) +// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) +// +// This pass also looks for ptest intrinsics & phi instructions where the +// operands are being needlessly converted to and from svbool_t. +// +//===----------------------------------------------------------------------===// + +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "sve-intrinsic-opts" + +namespace llvm { +void initializeSVEIntrinsicOptsPass(PassRegistry &); +} + +namespace { +struct SVEIntrinsicOpts : public ModulePass { + static char ID; // Pass identification, replacement for typeid + SVEIntrinsicOpts() : ModulePass(ID) { + initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + static IntrinsicInst *isReinterpretToSVBool(Value *V); + + static bool optimizeIntrinsic(Instruction *I); + + bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions); + + static bool optimizeConvertFromSVBool(IntrinsicInst *I); + static bool optimizePTest(IntrinsicInst *I); + + static bool processPhiNode(IntrinsicInst *I); +}; +} // end anonymous namespace + +void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); +} + +char SVEIntrinsicOpts::ID = 0; +static const char *name = "SVE intrinsics optimizations"; +INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) + +namespace llvm { +ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); } +} // namespace llvm + +/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr +/// otherwise. 
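+/// That is, V is expected to be a call of the form (value names are
+/// illustrative):
+///   %v = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %p)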
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) { + IntrinsicInst *I = dyn_cast<IntrinsicInst>(V); + if (!I) + return nullptr; + + if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) + return nullptr; + + return I; +} + +/// The function will remove redundant reinterprets casting in the presence +/// of the control flow +bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) { + + SmallVector<Instruction *, 32> Worklist; + auto RequiredType = X->getType(); + + auto *PN = dyn_cast<PHINode>(X->getArgOperand(0)); + assert(PN && "Expected Phi Node!"); + + // Don't create a new Phi unless we can remove the old one. + if (!PN->hasOneUse()) + return false; + + for (Value *IncValPhi : PN->incoming_values()) { + auto *Reinterpret = isReinterpretToSVBool(IncValPhi); + if (!Reinterpret || + RequiredType != Reinterpret->getArgOperand(0)->getType()) + return false; + } + + // Create the new Phi + LLVMContext &Ctx = PN->getContext(); + IRBuilder<> Builder(Ctx); + Builder.SetInsertPoint(PN); + PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); + Worklist.push_back(PN); + + for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { + auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); + NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); + Worklist.push_back(Reinterpret); + } + + // Cleanup Phi Node and reinterprets + X->replaceAllUsesWith(NPN); + X->eraseFromParent(); + + for (auto &I : Worklist) + if (I->use_empty()) + I->eraseFromParent(); + + return true; +} + +bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) { + IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0)); + IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1)); + + if (Op1 && Op2 && + Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { + + Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; + Type *Tys[] = {Op1->getArgOperand(0)->getType()}; + Module *M = I->getParent()->getParent()->getParent(); + + auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys); + auto CI = CallInst::Create(Fn, Ops, I->getName(), I); + + I->replaceAllUsesWith(CI); + I->eraseFromParent(); + if (Op1->use_empty()) + Op1->eraseFromParent(); + if (Op2->use_empty()) + Op2->eraseFromParent(); + + return true; + } + + return false; +} + +bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { + assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool && + "Unexpected opcode"); + + // If the reinterpret instruction operand is a PHI Node + if (isa<PHINode>(I->getArgOperand(0))) + return processPhiNode(I); + + // If we have a reinterpret intrinsic I of type A which is converting from + // another reinterpret Y of type B, and the source type of Y is A, then we can + // elide away both reinterprets if there are no other users of Y. 
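+  // For example (value names are illustrative), with A = <vscale x 4 x i1>:
+  //   %y = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  //   %i = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %y)
+  // Every use of %i can then be replaced by %a, and %y erased once it is dead.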
+ auto *Y = isReinterpretToSVBool(I->getArgOperand(0)); + if (!Y) + return false; + + Value *SourceVal = Y->getArgOperand(0); + if (I->getType() != SourceVal->getType()) + return false; + + I->replaceAllUsesWith(SourceVal); + I->eraseFromParent(); + if (Y->use_empty()) + Y->eraseFromParent(); + + return true; +} + +bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) { + IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I); + if (!IntrI) + return false; + + switch (IntrI->getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + return optimizeConvertFromSVBool(IntrI); + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + return optimizePTest(IntrI); + default: + return false; + } + + return true; +} + +bool SVEIntrinsicOpts::optimizeFunctions( + SmallSetVector<Function *, 4> &Functions) { + bool Changed = false; + for (auto *F : Functions) { + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree(); + + // Traverse the DT with an rpo walk so we see defs before uses, allowing + // simplification to be done incrementally. + BasicBlock *Root = DT->getRoot(); + ReversePostOrderTraversal<BasicBlock *> RPOT(Root); + for (auto *BB : RPOT) + for (Instruction &I : make_early_inc_range(*BB)) + Changed |= optimizeIntrinsic(&I); + } + return Changed; +} + +bool SVEIntrinsicOpts::runOnModule(Module &M) { + bool Changed = false; + SmallSetVector<Function *, 4> Functions; + + // Check for SVE intrinsic declarations first so that we only iterate over + // relevant functions. Where an appropriate declaration is found, store the + // function(s) where it is used so we can target these only. + for (auto &F : M.getFunctionList()) { + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + for (auto I = F.user_begin(), E = F.user_end(); I != E;) { + auto *Inst = dyn_cast<Instruction>(*I++); + Functions.insert(Inst->getFunction()); + } + break; + default: + break; + } + } + + if (!Functions.empty()) + Changed |= optimizeFunctions(Functions); + + return Changed; +} diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 87980cddb7c0b..4e289fbe23257 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -658,6 +658,7 @@ namespace AArch64 { // in index i*P of a <n x (M*P) x t> vector. The other elements of the // <n x (M*P) x t> vector (such as index 1) are undefined. static constexpr unsigned SVEBitsPerBlock = 128; +static constexpr unsigned SVEMaxBitsPerVector = 2048; const unsigned NeonBitsPerVector = 128; } // end namespace AArch64 } // end namespace llvm |