From cfca06d7963fa0909f90483b42a6d7d194d01e08 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sun, 26 Jul 2020 19:36:28 +0000 Subject: Vendor import of llvm-project master 2e10b7a39b9, the last commit before the llvmorg-12-init tag, from which release/11.x was branched. --- llvm/lib/Target/AArch64/AArch64.h | 9 +- llvm/lib/Target/AArch64/AArch64.td | 174 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 182 +- llvm/lib/Target/AArch64/AArch64BranchTargets.cpp | 12 +- llvm/lib/Target/AArch64/AArch64CallLowering.cpp | 1029 ---- llvm/lib/Target/AArch64/AArch64CallLowering.h | 82 - .../Target/AArch64/AArch64CallingConvention.cpp | 15 +- .../lib/Target/AArch64/AArch64CallingConvention.td | 145 +- .../AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 4 + llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 21 +- llvm/lib/Target/AArch64/AArch64Combine.td | 68 +- .../Target/AArch64/AArch64CompressJumpTables.cpp | 2 +- llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp | 16 +- .../Target/AArch64/AArch64ConditionOptimizer.cpp | 37 +- .../Target/AArch64/AArch64ConditionalCompares.cpp | 14 +- .../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 327 +- llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 3 - llvm/lib/Target/AArch64/AArch64FastISel.cpp | 15 +- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 702 ++- llvm/lib/Target/AArch64/AArch64FrameLowering.h | 30 +- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 743 ++- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3009 +++++++++-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 236 +- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 560 +- llvm/lib/Target/AArch64/AArch64InstrGISel.td | 124 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 557 +- llvm/lib/Target/AArch64/AArch64InstrInfo.h | 99 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 632 ++- .../Target/AArch64/AArch64InstructionSelector.cpp | 4918 ----------------- llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp | 771 --- llvm/lib/Target/AArch64/AArch64LegalizerInfo.h | 48 - .../Target/AArch64/AArch64LoadStoreOptimizer.cpp | 122 +- .../Target/AArch64/AArch64MachineFunctionInfo.cpp | 32 + .../Target/AArch64/AArch64MachineFunctionInfo.h | 32 + .../Target/AArch64/AArch64PreLegalizerCombiner.cpp | 168 - llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp | 23 +- .../lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 852 --- llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h | 145 - llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 140 +- llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 19 +- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 34 +- llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 5 +- llvm/lib/Target/AArch64/AArch64SLSHardening.cpp | 443 ++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 2140 ++++++-- llvm/lib/Target/AArch64/AArch64SchedA53.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedA57.td | 5 +- llvm/lib/Target/AArch64/AArch64SchedCyclone.td | 5 +- llvm/lib/Target/AArch64/AArch64SchedExynosM3.td | 5 +- llvm/lib/Target/AArch64/AArch64SchedExynosM4.td | 5 +- llvm/lib/Target/AArch64/AArch64SchedExynosM5.td | 5 +- llvm/lib/Target/AArch64/AArch64SchedFalkor.td | 4 +- .../Target/AArch64/AArch64SchedFalkorDetails.td | 4 +- llvm/lib/Target/AArch64/AArch64SchedKryo.td | 4 +- llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td | 4 +- llvm/lib/Target/AArch64/AArch64SchedThunderX.td | 4 +- .../lib/Target/AArch64/AArch64SchedThunderX2T99.td | 6 +- .../Target/AArch64/AArch64SchedThunderX3T110.td | 1997 +++++++ .../lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 29 +- 
llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h | 3 +- llvm/lib/Target/AArch64/AArch64StackOffset.h | 1 + llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 49 +- .../Target/AArch64/AArch64StorePairSuppress.cpp | 4 +- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 67 +- llvm/lib/Target/AArch64/AArch64Subtarget.h | 61 +- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 60 +- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 67 +- llvm/lib/Target/AArch64/AArch64TargetMachine.h | 8 + .../lib/Target/AArch64/AArch64TargetObjectFile.cpp | 5 +- llvm/lib/Target/AArch64/AArch64TargetObjectFile.h | 5 + .../Target/AArch64/AArch64TargetTransformInfo.cpp | 202 +- .../Target/AArch64/AArch64TargetTransformInfo.h | 85 +- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 95 +- .../AArch64/Disassembler/AArch64Disassembler.cpp | 36 + .../Target/AArch64/GISel/AArch64CallLowering.cpp | 1049 ++++ .../lib/Target/AArch64/GISel/AArch64CallLowering.h | 84 + .../AArch64/GISel/AArch64InstructionSelector.cpp | 5704 ++++++++++++++++++++ .../Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 809 +++ .../Target/AArch64/GISel/AArch64LegalizerInfo.h | 51 + .../AArch64/GISel/AArch64PostLegalizerCombiner.cpp | 507 ++ .../AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 203 + .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 868 +++ .../Target/AArch64/GISel/AArch64RegisterBankInfo.h | 145 + .../AArch64/MCTargetDesc/AArch64AddressingModes.h | 7 +- .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 84 +- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 17 +- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 28 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 52 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.h | 21 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4 +- .../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 31 +- .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 2 +- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 4 +- .../AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 2 +- .../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 16 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 1920 +++++-- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 265 + llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 1 + 97 files changed, 22545 insertions(+), 10896 deletions(-) delete mode 100644 llvm/lib/Target/AArch64/AArch64CallLowering.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64CallLowering.h create mode 100644 llvm/lib/Target/AArch64/AArch64InstrGISel.td delete mode 100644 llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64LegalizerInfo.h create mode 100644 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp delete mode 100644 llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h create mode 100644 llvm/lib/Target/AArch64/AArch64SLSHardening.cpp create mode 100644 llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp create mode 
100644 llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp create mode 100644 llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h create mode 100644 llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp (limited to 'llvm/lib/Target/AArch64') diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index ac765ebcddc04..fd35b530e3ce4 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -38,6 +38,8 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64SLSHardeningPass(); +FunctionPass *createAArch64IndirectThunks(); FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); FunctionPass *createAArch64SIMDInstrOptPass(); @@ -52,11 +54,13 @@ FunctionPass *createAArch64BranchTargetsPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, AArch64Subtarget &, AArch64RegisterBankInfo &); FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone); -FunctionPass *createAArch64StackTaggingPass(bool MergeInit); +FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone); +FunctionPass *createAArch64StackTaggingPass(bool IsOptNone); FunctionPass *createAArch64StackTaggingPreRAPass(); void initializeAArch64A53Fix835769Pass(PassRegistry&); @@ -70,16 +74,19 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); +void initializeAArch64SLSHardeningPass(PassRegistry&); void initializeAArch64SpeculationHardeningPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); +void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); void initializeFalkorHWPFFixPass(PassRegistry&); void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); +void initializeSVEIntrinsicOptsPass(PassRegistry&); void initializeAArch64StackTaggingPass(PassRegistry&); void initializeAArch64StackTaggingPreRAPass(PassRegistry&); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 0106355b1a440..534af9686af06 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -42,11 +42,11 @@ def FeatureAES : SubtargetFeature< "Enable AES support", [FeatureNEON]>; // Crypto has been split up and any combination is now valid (see the -// crypto defintions above). Also, crypto is now context sensitive: +// crypto definitions above). Also, crypto is now context sensitive: // it has a different meaning for e.g. Armv8.4 than it has for Armv8.2. // Therefore, we rely on Clang, the user interacing tool, to pass on the // appropriate crypto options. 
But here in the backend, crypto has very little -// meaning anymore. We kept the Crypto defintion here for backward +// meaning anymore. We kept the Crypto definition here for backward // compatibility, and now imply features SHA2 and AES, which was the // "traditional" meaning of Crypto. def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", @@ -101,7 +101,25 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", "true", "Enable v8.2 data Cache Clean to Point of Persistence" >; def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", - "Enable Scalable Vector Extension (SVE) instructions">; + "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>; + +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. +def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>; @@ -142,7 +160,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align", "Disallow all unaligned memory " "access">; -foreach i = {1-7,9-15,18,20-28} in +foreach i = {1-7,9-15,18,20-28,30} in def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true", "Reserve X"#i#", making it unavailable " "as a GPR">; @@ -240,11 +258,11 @@ def FeatureDotProd : SubtargetFeature< def FeaturePA : SubtargetFeature< "pa", "HasPA", "true", - "Enable v8.3-A Pointer Authentication enchancement">; + "Enable v8.3-A Pointer Authentication extension">; def FeatureJS : SubtargetFeature< "jsconv", "HasJS", "true", - "Enable v8.3-A JavaScript FP conversion enchancement", + "Enable v8.3-A JavaScript FP conversion instructions", [FeatureFPARMv8]>; def FeatureCCIDX : SubtargetFeature< @@ -281,6 +299,11 @@ def FeatureAM : SubtargetFeature< "am", "HasAM", "true", "Enable v8.4-A Activity Monitors extension">; +def FeatureAMVS : SubtargetFeature< + "amvs", "HasAMVS", "true", + "Enable v8.6-A Activity Monitors Virtualization support", + [FeatureAM]>; + def FeatureSEL2 : SubtargetFeature< "sel2", "HasSEL2", "true", "Enable v8.4-A Secure Exception Level 2 extension">; @@ -365,6 +388,25 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension" >; + +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", 
"HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + +def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", + "true", "Enable fine grained virtualization traps extension">; + +def FeatureEnhancedCounterVirtualization : + SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", + "true", "Enable enhanced counter virtualization extension">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -391,8 +433,13 @@ def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict, FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist, - FeatureBranchTargetId] ->; + FeatureBranchTargetId]>; + +def HasV8_6aOps : SubtargetFeature< + "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", + + [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; //===----------------------------------------------------------------------===// // Register File Description @@ -428,6 +475,17 @@ foreach i = 1-3 in def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP", "true", "Permit use of TPIDR_EL"#i#" for the TLS base">; +//===----------------------------------------------------------------------===// +// Control codegen mitigation against Straight Line Speculation vulnerability. +//===----------------------------------------------------------------------===// + +def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", + "HardenSlsRetBr", "true", + "Harden against straight line speculation across RET and BR instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across BLR instructions">; + //===----------------------------------------------------------------------===// // AArch64 Processors supported. 
// @@ -443,6 +501,10 @@ def SVEUnsupported : AArch64Unsupported { HasSVE2BitPerm]; } +def PAUnsupported : AArch64Unsupported { + let F = [HasPA]; +} + include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" @@ -453,6 +515,7 @@ include "AArch64SchedExynosM4.td" include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" +include "AArch64SchedThunderX3T110.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", [ @@ -563,6 +626,67 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", FeatureSSBS ]>; +def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", + "Cortex-A77 ARM processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureNEON, FeatureRCPC, + FeatureCrypto, + FeatureFullFP16, + FeatureDotProd + ]>; + +def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", + "CortexA78", + "Cortex-A78 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureSSBS, + FeatureDotProd]>; + +def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", + "Cortex-X1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureDotProd]>; + +def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", + "Fujitsu A64FX processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureNEON, + FeatureSHA2, + FeaturePerfMon, + FeatureFullFP16, + FeatureSVE, + FeaturePostRAScheduler, + FeatureComplxNum + ]>; + +def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", + "Nvidia Carmel processors", [ + HasV8_2aOps, + FeatureNEON, + FeatureCrypto, + FeatureFullFP16 + ]>; + // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targetting apple OSes. def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", @@ -780,6 +904,25 @@ def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", FeatureLSE, HasV8_1aOps]>; +def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily", + "ThunderX3T110", + "Marvell ThunderX3 processors", [ + FeatureAggressiveFMA, + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureArithmeticBccFusion, + FeatureNEON, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureLSE, + FeaturePA, + FeatureUseAA, + FeatureBalanceFPOps, + FeaturePerfMon, + FeatureStrictAlign, + HasV8_3aOps]>; + def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", "Cavium ThunderX processors", [ FeatureCRC, @@ -844,7 +987,7 @@ def : ProcessorModel<"generic", NoSchedModel, [ FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, -// ETE and TRBE are future architecture extensions. We temporariliy enable them +// ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64, until it is decided in which // armv8.x-a architecture revision they will end up. 
The extensions do not // affect code generated by the compiler and can be used only by explicitly @@ -853,6 +996,7 @@ def : ProcessorModel<"generic", NoSchedModel, [ ]>; def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; +def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; @@ -863,6 +1007,9 @@ def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>; +def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>; +def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; @@ -878,6 +1025,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; // Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; +// Marvell ThunderX3T110 Processors. +def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>; // FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57. def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>; @@ -900,6 +1049,13 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>; // Alias for the latest Apple processor model supported by LLVM. def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>; +// Fujitsu A64FX +// FIXME: Scheduling model is not implemented yet. +def : ProcessorModel<"a64fx", NoSchedModel, [ProcA64FX]>; + +// Nvidia Carmel +def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 00e321f9b8509..3a94820dac8d3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -84,8 +84,8 @@ public: return MCInstLowering.lowerOperand(MO, MCOp); } - void EmitStartOfAsmFile(Module &M) override; - void EmitJumpTableInfo() override; + void emitStartOfAsmFile(Module &M) override; + void emitJumpTableInfo() override; void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI); @@ -112,7 +112,9 @@ public: bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); - void EmitInstruction(const MachineInstr *MI) override; + void emitInstruction(const MachineInstr *MI) override; + + void emitFunctionHeaderComment() override; void getAnalysisUsage(AnalysisUsage &AU) const override { AsmPrinter::getAnalysisUsage(AU); @@ -139,7 +141,7 @@ public: } // Emit the rest of the function body. - EmitFunctionBody(); + emitFunctionBody(); // Emit the XRay table for this function. 
emitXRayTable(); @@ -162,10 +164,10 @@ private: void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - void EmitFunctionBodyEnd() override; + void emitFunctionBodyEnd() override; MCSymbol *GetCPISymbol(unsigned CPID) const override; - void EmitEndOfAsmFile(Module &M) override; + void emitEndOfAsmFile(Module &M) override; AArch64FunctionInfo *AArch64FI = nullptr; @@ -182,7 +184,7 @@ private: } // end anonymous namespace -void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) { +void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { if (!TM.getTargetTriple().isOSBinFormatELF()) return; @@ -225,22 +227,29 @@ void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer->SwitchSection(Nt); // Emit the note header. - EmitAlignment(Align(8)); - OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0" - OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size - OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); - OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name + emitAlignment(Align(8)); + OutStreamer->emitInt32(4); // data size for "GNU\0" + OutStreamer->emitInt32(4 * 4); // Elf_Prop size + OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0); + OutStreamer->emitBytes(StringRef("GNU", 4)); // note name // Emit the PAC/BTI properties. - OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); - OutStreamer->EmitIntValue(4, 4); // data size - OutStreamer->EmitIntValue(Flags, 4); // data - OutStreamer->EmitIntValue(0, 4); // pad + OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND); + OutStreamer->emitInt32(4); // data size + OutStreamer->emitInt32(Flags); // data + OutStreamer->emitInt32(0); // pad OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); } +void AArch64AsmPrinter::emitFunctionHeaderComment() { + const AArch64FunctionInfo *FI = MF->getInfo(); + Optional OutlinerString = FI->getOutliningStyle(); + if (OutlinerString != None) + OutStreamer->GetCommentOS() << ' ' << OutlinerString; +} + void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { const Function &F = MF->getFunction(); @@ -250,8 +259,7 @@ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) .getValueAsString() .getAsInteger(10, Num)) return; - for (; Num; --Num) - EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + emitNops(Num); return; } @@ -291,9 +299,9 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) // ;DATA: higher 32 bits of the address of the trampoline // LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack // - OutStreamer->EmitCodeAlignment(4); + OutStreamer->emitCodeAlignment(4); auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitLabel(CurSled); auto Target = OutContext.createTempSymbol(); // Emit "B #32" instruction, which jumps over the next 28 bytes. 
@@ -304,8 +312,8 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) for (int8_t I = 0; I < NoopsInSledCount; I++) EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); - OutStreamer->EmitLabel(Target); - recordSled(CurSled, MI, Kind); + OutStreamer->emitLabel(Target); + recordSled(CurSled, MI, Kind, 2); } void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) { @@ -364,25 +372,25 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName())); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak); - OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden); - OutStreamer->EmitLabel(Sym); + OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); + OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); + OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden); + OutStreamer->emitLabel(Sym); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri) .addReg(AArch64::X16) .addReg(Reg) .addImm(4) .addImm(55), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBroX) .addReg(AArch64::W16) .addReg(AArch64::X9) .addReg(AArch64::X16) .addImm(0) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) @@ -390,33 +398,33 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), *STI); MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol(); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::NE) .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym, OutContext)), *STI); MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->EmitLabel(ReturnSym); - OutStreamer->EmitInstruction( + OutStreamer->emitLabel(ReturnSym); + OutStreamer->emitInstruction( MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI); - OutStreamer->EmitLabel(HandleMismatchOrPartialSym); + OutStreamer->emitLabel(HandleMismatchOrPartialSym); if (IsShort) { - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri) .addReg(AArch64::WZR) .addReg(AArch64::W16) .addImm(15) .addImm(0), *STI); MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::HI) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ANDXri) .addReg(AArch64::X17) .addReg(Reg) @@ -424,59 +432,59 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { *STI); unsigned Size = 1 << (AccessInfo & 0xf); if (Size != 1) - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X17) .addReg(AArch64::X17) .addImm(Size - 1) .addImm(0), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWrs) .addReg(AArch64::WZR) .addReg(AArch64::W16) .addReg(AArch64::W17) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( 
MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::LS) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ORRXri) .addReg(AArch64::X16) .addReg(Reg) .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBui) .addReg(AArch64::W16) .addReg(AArch64::X16) .addImm(0), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) .addReg(Reg) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::EQ) .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), *STI); - OutStreamer->EmitLabel(HandleMismatchSym); + OutStreamer->emitLabel(HandleMismatchSym); } - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXpre) .addReg(AArch64::SP) .addReg(AArch64::X0) .addReg(AArch64::X1) .addReg(AArch64::SP) .addImm(-32), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXi) .addReg(AArch64::FP) .addReg(AArch64::LR) .addReg(AArch64::SP) @@ -484,13 +492,13 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { *STI); if (Reg != AArch64::X0) - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::ORRXrs) .addReg(AArch64::X0) .addReg(AArch64::XZR) .addReg(Reg) .addImm(0), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::MOVZXi) .addReg(AArch64::X1) .addImm(AccessInfo) .addImm(0), @@ -499,14 +507,14 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { // Intentionally load the GOT entry and branch to it, rather than possibly // late binding the function, which may clobber the registers before we have // a chance to save them. - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::ADRP) .addReg(AArch64::X16) .addExpr(AArch64MCExpr::create( HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::LDRXui) .addReg(AArch64::X16) .addReg(AArch64::X16) @@ -514,12 +522,12 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)), *STI); - OutStreamer->EmitInstruction( + OutStreamer->emitInstruction( MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI); } } -void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { +void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { EmitHwasanMemaccessSymbols(M); const Triple &TT = TM.getTargetTriple(); @@ -529,7 +537,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { // implementation of multiple entry points). If this doesn't occur, the // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. 
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } emitStackMaps(SM); } @@ -544,12 +552,12 @@ void AArch64AsmPrinter::EmitLOHs() { "Label hasn't been inserted for LOH related instruction"); MCArgs.push_back(LabelIt->second); } - OutStreamer->EmitLOHDirective(D.getKind(), MCArgs); + OutStreamer->emitLOHDirective(D.getKind(), MCArgs); MCArgs.clear(); } } -void AArch64AsmPrinter::EmitFunctionBodyEnd() { +void AArch64AsmPrinter::emitFunctionBodyEnd() { if (!AArch64FI->getLOHRelated().empty()) EmitLOHs(); } @@ -741,11 +749,10 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, assert(NOps == 4); OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; // cast away const; DIetc do not take const operands for some reason. - OS << cast(MI->getOperand(NOps - 2).getMetadata()) - ->getName(); + OS << MI->getDebugVariable()->getName(); OS << " <- "; // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + assert(MI->getDebugOperand(0).isReg() && MI->isDebugOffsetImm()); OS << '['; printOperand(MI, 0, OS); OS << '+'; @@ -755,7 +762,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, printOperand(MI, NOps - 2, OS); } -void AArch64AsmPrinter::EmitJumpTableInfo() { +void AArch64AsmPrinter::emitJumpTableInfo() { const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (!MJTI) return; @@ -783,8 +790,8 @@ void AArch64AsmPrinter::EmitJumpTableInfo() { if (JTBBs.empty()) continue; unsigned Size = AFI->getJumpTableEntrySize(JTI); - EmitAlignment(Align(Size)); - OutStreamer->EmitLabel(GetJTISymbol(JTI)); + emitAlignment(Align(Size)); + OutStreamer->emitLabel(GetJTISymbol(JTI)); for (auto *JTBB : JTBBs) emitJumpTableEntry(MJTI, JTBB, JTI); @@ -812,7 +819,7 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, Value, MCConstantExpr::create(2, OutContext), OutContext); } - OutStreamer->EmitValue(Value, Size); + OutStreamer->emitValue(Value, Size); } /// Small jump tables contain an unsigned byte or half, representing the offset @@ -868,7 +875,7 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, auto &Ctx = OutStreamer.getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer.EmitLabel(MILabel); + OutStreamer.emitLabel(MILabel); SM.recordStackMap(*MILabel, MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); @@ -898,7 +905,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { auto &Ctx = OutStreamer.getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer.EmitLabel(MILabel); + OutStreamer.emitLabel(MILabel); SM.recordPatchPoint(*MILabel, MI); PatchPointOpers Opers(&MI); @@ -982,7 +989,7 @@ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" -void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { +void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { // Do any auto-generated pseudo lowerings. 
if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; @@ -992,7 +999,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *LOHLabel = createTempSymbol("loh"); // Associate the instruction with the label LOHInstToLabel[MI] = LOHLabel; - OutStreamer->EmitLabel(LOHLabel); + OutStreamer->emitLabel(LOHLabel); } AArch64TargetStreamer *TS = @@ -1001,6 +1008,26 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case AArch64::HINT: { + // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for + // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be + // non-empty. If MI is the initial BTI, place the + // __patchable_function_entries label after BTI. + if (CurrentPatchableFunctionEntrySym && + CurrentPatchableFunctionEntrySym == CurrentFnBegin && + MI == &MF->front().front()) { + int64_t Imm = MI->getOperand(0).getImm(); + if ((Imm & 32) && (Imm & 6)) { + MCInst Inst; + MCInstLowering.Lower(MI, Inst); + EmitToStreamer(*OutStreamer, Inst); + CurrentPatchableFunctionEntrySym = createTempSymbol("patch"); + OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym); + return; + } + } + break; + } case AArch64::MOVMCSym: { Register DestReg = MI->getOperand(0).getReg(); const MachineOperand &MO_Sym = MI->getOperand(1); @@ -1048,7 +1075,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallString<128> TmpStr; raw_svector_ostream OS(TmpStr); PrintDebugValueComment(MI, OS); - OutStreamer->EmitRawText(StringRef(OS.str())); + OutStreamer->emitRawText(StringRef(OS.str())); } return; @@ -1061,7 +1088,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (needsCFIMoves() == CFI_M_None) return; - OutStreamer->EmitCFIBKeyFrame(); + OutStreamer->emitCFIBKeyFrame(); return; } } @@ -1087,6 +1114,25 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } + case AArch64::SpeculationBarrierISBDSBEndBB: { + // Print DSB SYS + ISB + MCInst TmpInstDSB; + TmpInstDSB.setOpcode(AArch64::DSB); + TmpInstDSB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstDSB); + MCInst TmpInstISB; + TmpInstISB.setOpcode(AArch64::ISB); + TmpInstISB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstISB); + return; + } + case AArch64::SpeculationBarrierSBEndBB: { + // Print SB + MCInst TmpInstSB; + TmpInstSB.setOpcode(AArch64::SB); + EmitToStreamer(*OutStreamer, TmpInstSB); + return; + } case AArch64::TLSDESC_CALLSEQ: { /// lower this to: /// adrp x0, :tlsdesc:var diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index 6fa3a462bc71a..1956014b738d0 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -118,9 +118,15 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, auto MBBI = MBB.begin(); - // PACI[AB]SP are implicitly BTI JC, so no BTI instruction needed there. - if (MBBI != MBB.end() && (MBBI->getOpcode() == AArch64::PACIASP || - MBBI->getOpcode() == AArch64::PACIBSP)) + // Skip the meta instuctions, those will be removed anyway. + for (; MBBI != MBB.end() && MBBI->isMetaInstruction(); ++MBBI) + ; + + // SCTLR_EL1.BT[01] is set to 0 by default which means + // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there. 
+ if (MBBI != MBB.end() && HintNum == 34 && + (MBBI->getOpcode() == AArch64::PACIASP || + MBBI->getOpcode() == AArch64::PACIBSP)) return; BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp deleted file mode 100644 index 76ff238234d99..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -//===--- AArch64CallLowering.cpp - Call lowering --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements the lowering of LLVM calls to machine code calls for -/// GlobalISel. -/// -//===----------------------------------------------------------------------===// - -#include "AArch64CallLowering.h" -#include "AArch64ISelLowering.h" -#include "AArch64MachineFunctionInfo.h" -#include "AArch64Subtarget.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/LowLevelType.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/MachineValueType.h" -#include -#include -#include -#include - -#define DEBUG_TYPE "aarch64-call-lowering" - -using namespace llvm; - -AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) - : CallLowering(&TLI) {} - -namespace { -struct IncomingArgHandler : public CallLowering::ValueHandler { - IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn) - : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} - - Register getStackAddress(uint64_t Size, int64_t Offset, - MachinePointerInfo &MPO) override { - auto &MFI = MIRBuilder.getMF().getFrameInfo(); - int FI = MFI.CreateFixedObject(Size, Offset, true); - MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64)); - MIRBuilder.buildFrameIndex(AddrReg, FI); - StackUsed = std::max(StackUsed, Size + Offset); - return AddrReg; - } - - void assignValueToReg(Register ValVReg, Register PhysReg, - CCValAssign &VA) override { - markPhysRegUsed(PhysReg); - switch (VA.getLocInfo()) { - default: - MIRBuilder.buildCopy(ValVReg, PhysReg); - break; - case CCValAssign::LocInfo::SExt: - case CCValAssign::LocInfo::ZExt: - case CCValAssign::LocInfo::AExt: { - auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); - MIRBuilder.buildTrunc(ValVReg, Copy); - break; - } - } - } - - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, - MachinePointerInfo 
&MPO, CCValAssign &VA) override { - // FIXME: Get alignment - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, - 1); - MIRBuilder.buildLoad(ValVReg, Addr, *MMO); - } - - /// How the physical register gets marked varies between formal - /// parameters (it's a basic-block live-in), and a call instruction - /// (it's an implicit-def of the BL). - virtual void markPhysRegUsed(unsigned PhysReg) = 0; - - bool isIncomingArgumentHandler() const override { return true; } - - uint64_t StackUsed; -}; - -struct FormalArgHandler : public IncomingArgHandler { - FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn) - : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} - - void markPhysRegUsed(unsigned PhysReg) override { - MIRBuilder.getMRI()->addLiveIn(PhysReg); - MIRBuilder.getMBB().addLiveIn(PhysReg); - } -}; - -struct CallReturnHandler : public IncomingArgHandler { - CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} - - void markPhysRegUsed(unsigned PhysReg) override { - MIB.addDef(PhysReg, RegState::Implicit); - } - - MachineInstrBuilder MIB; -}; - -struct OutgoingArgHandler : public CallLowering::ValueHandler { - OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn, - CCAssignFn *AssignFnVarArg, bool IsTailCall = false, - int FPDiff = 0) - : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), - AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), - StackSize(0) {} - - bool isIncomingArgumentHandler() const override { return false; } - - Register getStackAddress(uint64_t Size, int64_t Offset, - MachinePointerInfo &MPO) override { - MachineFunction &MF = MIRBuilder.getMF(); - LLT p0 = LLT::pointer(0, 64); - LLT s64 = LLT::scalar(64); - - if (IsTailCall) { - Offset += FPDiff; - int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); - Register FIReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildFrameIndex(FIReg, FI); - MPO = MachinePointerInfo::getFixedStack(MF, FI); - return FIReg; - } - - Register SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, Register(AArch64::SP)); - - Register OffsetReg = MRI.createGenericVirtualRegister(s64); - MIRBuilder.buildConstant(OffsetReg, Offset); - - Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); - - MPO = MachinePointerInfo::getStack(MF, Offset); - return AddrReg; - } - - void assignValueToReg(Register ValVReg, Register PhysReg, - CCValAssign &VA) override { - MIB.addUse(PhysReg, RegState::Implicit); - Register ExtReg = extendRegister(ValVReg, VA); - MIRBuilder.buildCopy(PhysReg, ExtReg); - } - - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, - MachinePointerInfo &MPO, CCValAssign &VA) override { - if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) { - Size = VA.getLocVT().getSizeInBits() / 8; - ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg) - ->getOperand(0) - .getReg(); - } - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOStore, Size, 1); - MIRBuilder.buildStore(ValVReg, Addr, *MMO); - } - - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, - ISD::ArgFlagsTy Flags, - 
CCState &State) override { - bool Res; - if (Info.IsFixed) - Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); - else - Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); - - StackSize = State.getNextStackOffset(); - return Res; - } - - MachineInstrBuilder MIB; - CCAssignFn *AssignFnVarArg; - bool IsTailCall; - - /// For tail calls, the byte offset of the call's argument area from the - /// callee's. Unused elsewhere. - int FPDiff; - uint64_t StackSize; -}; -} // namespace - -static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { - return CallConv == CallingConv::Fast && TailCallOpt; -} - -void AArch64CallLowering::splitToValueTypes( - const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const { - const AArch64TargetLowering &TLI = *getTLI(); - LLVMContext &Ctx = OrigArg.Ty->getContext(); - - if (OrigArg.Ty->isVoidTy()) - return; - - SmallVector SplitVTs; - SmallVector Offsets; - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - - if (SplitVTs.size() == 1) { - // No splitting to do, but we want to replace the original type (e.g. [1 x - // double] -> double). - SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags[0], OrigArg.IsFixed); - return; - } - - // Create one ArgInfo for each virtual register in the original ArgInfo. - assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); - - bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( - OrigArg.Ty, CallConv, false); - for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { - Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); - SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], - OrigArg.IsFixed); - if (NeedsRegBlock) - SplitArgs.back().Flags[0].setInConsecutiveRegs(); - } - - SplitArgs.back().Flags[0].setInConsecutiveRegsLast(); -} - -bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, - ArrayRef VRegs, - Register SwiftErrorVReg) const { - auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR); - assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && - "Return value without a vreg"); - - bool Success = true; - if (!VRegs.empty()) { - MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = MF.getFunction(); - - MachineRegisterInfo &MRI = MF.getRegInfo(); - const AArch64TargetLowering &TLI = *getTLI(); - CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); - auto &DL = F.getParent()->getDataLayout(); - LLVMContext &Ctx = Val->getType()->getContext(); - - SmallVector SplitEVTs; - ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); - assert(VRegs.size() == SplitEVTs.size() && - "For each split Type there should be exactly one VReg."); - - SmallVector SplitArgs; - CallingConv::ID CC = F.getCallingConv(); - - for (unsigned i = 0; i < SplitEVTs.size(); ++i) { - if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) { - LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need split"); - return false; - } - - Register CurVReg = VRegs[i]; - ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)}; - setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); - - // i1 is a special case because SDAG i1 true is naturally zero extended - // when widened using ANYEXT. We need to do it explicitly here. 
- if (MRI.getType(CurVReg).getSizeInBits() == 1) { - CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0); - } else { - // Some types will need extending as specified by the CC. - MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]); - if (EVT(NewVT) != SplitEVTs[i]) { - unsigned ExtendOp = TargetOpcode::G_ANYEXT; - if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::SExt)) - ExtendOp = TargetOpcode::G_SEXT; - else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::ZExt)) - ExtendOp = TargetOpcode::G_ZEXT; - - LLT NewLLT(NewVT); - LLT OldLLT(MVT::getVT(CurArgInfo.Ty)); - CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx); - // Instead of an extend, we might have a vector type which needs - // padding with more elements, e.g. <2 x half> -> <4 x half>. - if (NewVT.isVector()) { - if (OldLLT.isVector()) { - if (NewLLT.getNumElements() > OldLLT.getNumElements()) { - // We don't handle VA types which are not exactly twice the - // size, but can easily be done in future. - if (NewLLT.getNumElements() != OldLLT.getNumElements() * 2) { - LLVM_DEBUG(dbgs() << "Outgoing vector ret has too many elts"); - return false; - } - auto Undef = MIRBuilder.buildUndef({OldLLT}); - CurVReg = - MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)}) - .getReg(0); - } else { - // Just do a vector extend. - CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}) - .getReg(0); - } - } else if (NewLLT.getNumElements() == 2) { - // We need to pad a <1 x S> type to <2 x S>. Since we don't have - // <1 x S> vector types in GISel we use a build_vector instead - // of a vector merge/concat. - auto Undef = MIRBuilder.buildUndef({OldLLT}); - CurVReg = - MIRBuilder - .buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)}) - .getReg(0); - } else { - LLVM_DEBUG(dbgs() << "Could not handle ret ty"); - return false; - } - } else { - // A scalar extend. - CurVReg = - MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0); - } - } - } - if (CurVReg != CurArgInfo.Regs[0]) { - CurArgInfo.Regs[0] = CurVReg; - // Reset the arg flags after modifying CurVReg. - setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); - } - splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC); - } - - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn); - Success = handleAssignments(MIRBuilder, SplitArgs, Handler); - } - - if (SwiftErrorVReg) { - MIB.addUse(AArch64::X21, RegState::Implicit); - MIRBuilder.buildCopy(AArch64::X21, SwiftErrorVReg); - } - - MIRBuilder.insertInstr(MIB); - return Success; -} - -/// Helper function to compute forwarded registers for musttail calls. Computes -/// the forwarded registers, sets MBB liveness, and emits COPY instructions that -/// can be used to save + restore registers later. -static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, - CCAssignFn *AssignFn) { - MachineBasicBlock &MBB = MIRBuilder.getMBB(); - MachineFunction &MF = MIRBuilder.getMF(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - if (!MFI.hasMustTailInVarArgFunc()) - return; - - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - const Function &F = MF.getFunction(); - assert(F.isVarArg() && "Expected F to be vararg?"); - - // Compute the set of forwarded registers. The rest are scratch. 
- SmallVector ArgLocs; - CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs, - F.getContext()); - SmallVector RegParmTypes; - RegParmTypes.push_back(MVT::i64); - RegParmTypes.push_back(MVT::f128); - - // Later on, we can use this vector to restore the registers if necessary. - SmallVectorImpl &Forwards = - FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); - - // Conservatively forward X8, since it might be used for an aggregate - // return. - if (!CCInfo.isAllocated(AArch64::X8)) { - unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); - Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); - } - - // Add the forwards to the MachineBasicBlock and MachineFunction. - for (const auto &F : Forwards) { - MBB.addLiveIn(F.PReg); - MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg)); - } -} - -bool AArch64CallLowering::lowerFormalArguments( - MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); - MachineBasicBlock &MBB = MIRBuilder.getMBB(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &DL = F.getParent()->getDataLayout(); - - SmallVector SplitArgs; - unsigned i = 0; - for (auto &Arg : F.args()) { - if (DL.getTypeStoreSize(Arg.getType()) == 0) - continue; - - ArgInfo OrigArg{VRegs[i], Arg.getType()}; - setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F); - - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv()); - ++i; - } - - if (!MBB.empty()) - MIRBuilder.setInstr(*MBB.begin()); - - const AArch64TargetLowering &TLI = *getTLI(); - CCAssignFn *AssignFn = - TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); - - FormalArgHandler Handler(MIRBuilder, MRI, AssignFn); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) - return false; - - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - uint64_t StackOffset = Handler.StackUsed; - if (F.isVarArg()) { - auto &Subtarget = MF.getSubtarget(); - if (!Subtarget.isTargetDarwin()) { - // FIXME: we need to reimplement saveVarArgsRegisters from - // AArch64ISelLowering. - return false; - } - - // We currently pass all varargs at 8-byte alignment, or 4 in ILP32. - StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8); - - auto &MFI = MIRBuilder.getMF().getFrameInfo(); - FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); - } - - if (doesCalleeRestoreStack(F.getCallingConv(), - MF.getTarget().Options.GuaranteedTailCallOpt)) { - // We have a non-standard ABI, so why not make full use of the stack that - // we're going to pop? It must be aligned to 16 B in any case. - StackOffset = alignTo(StackOffset, 16); - - // If we're expected to restore the stack (e.g. fastcc), then we'll be - // adding a multiple of 16. - FuncInfo->setArgumentStackToRestore(StackOffset); - - // Our own callers will guarantee that the space is free by giving an - // aligned value to CALLSEQ_START. - } - - // When we tail call, we need to check if the callee's arguments - // will fit on the caller's stack. So, whenever we lower formal arguments, - // we should keep track of this information, since we might lower a tail call - // in this function later. 
- FuncInfo->setBytesInStackArgArea(StackOffset); - - auto &Subtarget = MF.getSubtarget(); - if (Subtarget.hasCustomCallingConv()) - Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - - handleMustTailForwardedRegisters(MIRBuilder, AssignFn); - - // Move back to the end of the basic block. - MIRBuilder.setMBB(MBB); - - return true; -} - -/// Return true if the calling convention is one that we can guarantee TCO for. -static bool canGuaranteeTCO(CallingConv::ID CC) { - return CC == CallingConv::Fast; -} - -/// Return true if we might ever do TCO for calls with this calling convention. -static bool mayTailCallThisCC(CallingConv::ID CC) { - switch (CC) { - case CallingConv::C: - case CallingConv::PreserveMost: - case CallingConv::Swift: - return true; - default: - return canGuaranteeTCO(CC); - } -} - -/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for -/// CC. -static std::pair -getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) { - return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)}; -} - -bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay( - CallLoweringInfo &Info, MachineFunction &MF, - SmallVectorImpl &InArgs) const { - const Function &CallerF = MF.getFunction(); - CallingConv::ID CalleeCC = Info.CallConv; - CallingConv::ID CallerCC = CallerF.getCallingConv(); - - // If the calling conventions match, then everything must be the same. - if (CalleeCC == CallerCC) - return true; - - // Check if the caller and callee will handle arguments in the same way. - const AArch64TargetLowering &TLI = *getTLI(); - CCAssignFn *CalleeAssignFnFixed; - CCAssignFn *CalleeAssignFnVarArg; - std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) = - getAssignFnsForCC(CalleeCC, TLI); - - CCAssignFn *CallerAssignFnFixed; - CCAssignFn *CallerAssignFnVarArg; - std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) = - getAssignFnsForCC(CallerCC, TLI); - - if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed, - *CalleeAssignFnVarArg, *CallerAssignFnFixed, - *CallerAssignFnVarArg)) - return false; - - // Make sure that the caller and callee preserve all of the same registers. - auto TRI = MF.getSubtarget().getRegisterInfo(); - const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); - const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); - if (MF.getSubtarget().hasCustomCallingConv()) { - TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); - TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); - } - - return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved); -} - -bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable( - CallLoweringInfo &Info, MachineFunction &MF, - SmallVectorImpl &OutArgs) const { - // If there are no outgoing arguments, then we are done. - if (OutArgs.empty()) - return true; - - const Function &CallerF = MF.getFunction(); - CallingConv::ID CalleeCC = Info.CallConv; - CallingConv::ID CallerCC = CallerF.getCallingConv(); - const AArch64TargetLowering &TLI = *getTLI(); - - CCAssignFn *AssignFnFixed; - CCAssignFn *AssignFnVarArg; - std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); - - // We have outgoing arguments. Make sure that we can tail call with them. - SmallVector OutLocs; - CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext()); - - if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) { - LLVM_DEBUG(dbgs() << "... 
Could not analyze call operands.\n"); - return false; - } - - // Make sure that they can fit on the caller's stack. - const AArch64FunctionInfo *FuncInfo = MF.getInfo(); - if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { - LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); - return false; - } - - // Verify that the parameters in callee-saved registers match. - // TODO: Port this over to CallLowering as general code once swiftself is - // supported. - auto TRI = MF.getSubtarget().getRegisterInfo(); - const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - for (unsigned i = 0; i < OutLocs.size(); ++i) { - auto &ArgLoc = OutLocs[i]; - // If it's not a register, it's fine. - if (!ArgLoc.isRegLoc()) { - if (Info.IsVarArg) { - // Be conservative and disallow variadic memory operands to match SDAG's - // behaviour. - // FIXME: If the caller's calling convention is C, then we can - // potentially use its argument area. However, for cases like fastcc, - // we can't do anything. - LLVM_DEBUG( - dbgs() - << "... Cannot tail call vararg function with stack arguments\n"); - return false; - } - continue; - } - - Register Reg = ArgLoc.getLocReg(); - - // Only look at callee-saved registers. - if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg)) - continue; - - LLVM_DEBUG( - dbgs() - << "... Call has an argument passed in a callee-saved register.\n"); - - // Check if it was copied from. - ArgInfo &OutInfo = OutArgs[i]; - - if (OutInfo.Regs.size() > 1) { - LLVM_DEBUG( - dbgs() << "... Cannot handle arguments in multiple registers.\n"); - return false; - } - - // Check if we copy the register, walking through copies from virtual - // registers. Note that getDefIgnoringCopies does not ignore copies from - // physical registers. - MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI); - if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) { - LLVM_DEBUG( - dbgs() - << "... Parameter was not copied into a VReg, cannot tail call.\n"); - return false; - } - - // Got a copy. Verify that it's the same as the register we want. - Register CopyRHS = RegDef->getOperand(1).getReg(); - if (CopyRHS != Reg) { - LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into " - "VReg, cannot tail call.\n"); - return false; - } - } - - return true; -} - -bool AArch64CallLowering::isEligibleForTailCallOptimization( - MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, - SmallVectorImpl &InArgs, - SmallVectorImpl &OutArgs) const { - - // Must pass all target-independent checks in order to tail call optimize. - if (!Info.IsTailCall) - return false; - - CallingConv::ID CalleeCC = Info.CallConv; - MachineFunction &MF = MIRBuilder.getMF(); - const Function &CallerF = MF.getFunction(); - - LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n"); - - if (Info.SwiftErrorVReg) { - // TODO: We should handle this. - // Note that this is also handled by the check for no outgoing arguments. - // Proactively disabling this though, because the swifterror handling in - // lowerCall inserts a COPY *after* the location of the call. - LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n"); - return false; - } - - if (!mayTailCallThisCC(CalleeCC)) { - LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n"); - return false; - } - - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. 
Working around this *is* possible (see - // X86). - // - // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try - // it? - // - // On Windows, "inreg" attributes signify non-aggregate indirect returns. - // In this case, it is necessary to save/restore X0 in the callee. Tail - // call opt interferes with this. So we disable tail call opt when the - // caller has an argument with "inreg" attribute. - // - // FIXME: Check whether the callee also has an "inreg" argument. - // - // When the caller has a swifterror argument, we don't want to tail call - // because would have to move into the swifterror register before the - // tail call. - if (any_of(CallerF.args(), [](const Argument &A) { - return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr(); - })) { - LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, " - "inreg, or swifterror arguments\n"); - return false; - } - - // Externally-defined functions with weak linkage should not be - // tail-called on AArch64 when the OS does not support dynamic - // pre-emption of symbols, as the AAELF spec requires normal calls - // to undefined weak functions to be replaced with a NOP or jump to the - // next instruction. The behaviour of branch instructions in this - // situation (as used for tail calls) is implementation-defined, so we - // cannot rely on the linker replacing the tail call with a return. - if (Info.Callee.isGlobal()) { - const GlobalValue *GV = Info.Callee.getGlobal(); - const Triple &TT = MF.getTarget().getTargetTriple(); - if (GV->hasExternalWeakLinkage() && - (!TT.isOSWindows() || TT.isOSBinFormatELF() || - TT.isOSBinFormatMachO())) { - LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function " - "with weak linkage for this OS.\n"); - return false; - } - } - - // If we have -tailcallopt, then we're done. - if (MF.getTarget().Options.GuaranteedTailCallOpt) - return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); - - // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall). - // Try to find cases where we can do that. - - // I want anyone implementing a new calling convention to think long and hard - // about this assert. - assert((!Info.IsVarArg || CalleeCC == CallingConv::C) && - "Unexpected variadic calling convention"); - - // Verify that the incoming and outgoing arguments from the callee are - // safe to tail call. - if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) { - LLVM_DEBUG( - dbgs() - << "... Caller and callee have incompatible calling conventions.\n"); - return false; - } - - if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) - return false; - - LLVM_DEBUG( - dbgs() << "... Call is eligible for tail call optimization.\n"); - return true; -} - -static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect, - bool IsTailCall) { - if (!IsTailCall) - return IsIndirect ? AArch64::BLR : AArch64::BL; - - if (!IsIndirect) - return AArch64::TCRETURNdi; - - // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use - // x16 or x17. 
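Taken together, the checks above condense into a single predicate, sketched below before the BTI-specific opcode selection resumes. All of the types and field names here are invented stand-ins, not the LLVM classes; the last two booleans summarise what doCallerAndCalleePassArgsTheSameWay and areCalleeOutgoingArgsTailCallable decide in the earlier hunks, and the weak-linkage flag folds the per-OS test into one bit.

enum class CallConv { C, Fast, PreserveMost, Swift, Other };

struct TailCallQuery {
  bool MarkedTailCall = false;            // call site carried a tail-call hint
  bool HasSwiftErrorResult = false;       // swifterror needs a COPY after the call
  CallConv CalleeCC = CallConv::C;
  CallConv CallerCC = CallConv::C;
  bool CallerHasByValInRegOrSwiftErrorArg = false;
  bool CalleeIsExternalWeak = false;      // simplified: only rejected on OSes
                                          // without preemptible weak symbols
  bool GuaranteedTailCallOpt = false;     // -tailcallopt
  bool ArgsPassedTheSameWay = false;      // caller/callee conventions compatible
  bool OutgoingArgsTailCallable = false;  // fit in caller's area, CSR args forwarded
};

bool canGuaranteeTCO(CallConv CC) { return CC == CallConv::Fast; }

bool mayTailCall(CallConv CC) {
  return CC == CallConv::C || CC == CallConv::PreserveMost ||
         CC == CallConv::Swift || canGuaranteeTCO(CC);
}

bool isEligibleForTailCall(const TailCallQuery &Q) {
  if (!Q.MarkedTailCall || Q.HasSwiftErrorResult)
    return false;
  if (!mayTailCall(Q.CalleeCC))
    return false;
  if (Q.CallerHasByValInRegOrSwiftErrorArg)
    return false;
  if (Q.CalleeIsExternalWeak)
    return false;
  // With -tailcallopt the ABI must not change: only guaranteed-TCO
  // conventions qualify, and caller and callee must agree on one.
  if (Q.GuaranteedTailCallOpt)
    return canGuaranteeTCO(Q.CalleeCC) && Q.CalleeCC == Q.CallerCC;
  // Otherwise this is a sibcall; the existing ABI has to line up by itself.
  return Q.ArgsPassedTheSameWay && Q.OutgoingArgsTailCallable;
}

int main() {
  TailCallQuery Q;
  Q.MarkedTailCall = true;
  Q.ArgsPassedTheSameWay = true;
  Q.OutgoingArgsTailCallable = true;
  return isEligibleForTailCall(Q) ? 0 : 1; // eligible as a sibcall
}

The ordering mirrors the hunk: cheap structural rejects first, then the -tailcallopt path, then the sibcall ABI-compatibility checks.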
- if (CallerF.hasFnAttribute("branch-target-enforcement")) - return AArch64::TCRETURNriBTI; - - return AArch64::TCRETURNri; -} - -bool AArch64CallLowering::lowerTailCall( - MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, - SmallVectorImpl &OutArgs) const { - MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = MF.getFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const AArch64TargetLowering &TLI = *getTLI(); - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - - // True when we're tail calling, but without -tailcallopt. - bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; - - // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 - // register class. Until we can do that, we should fall back here. - if (F.hasFnAttribute("branch-target-enforcement")) { - LLVM_DEBUG( - dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n"); - return false; - } - - // Find out which ABI gets to decide where things go. - CallingConv::ID CalleeCC = Info.CallConv; - CCAssignFn *AssignFnFixed; - CCAssignFn *AssignFnVarArg; - std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); - - MachineInstrBuilder CallSeqStart; - if (!IsSibCall) - CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); - - unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true); - auto MIB = MIRBuilder.buildInstrNoInsert(Opc); - MIB.add(Info.Callee); - - // Byte offset for the tail call. When we are sibcalling, this will always - // be 0. - MIB.addImm(0); - - // Tell the call which registers are clobbered. - auto TRI = MF.getSubtarget().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); - if (MF.getSubtarget().hasCustomCallingConv()) - TRI->UpdateCustomCallPreservedMask(MF, &Mask); - MIB.addRegMask(Mask); - - if (TRI->isAnyArgRegReserved(MF)) - TRI->emitReservedArgRegCallError(MF); - - // FPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. - int FPDiff = 0; - - // This will be 0 for sibcalls, potentially nonzero for tail calls produced - // by -tailcallopt. For sibcalls, the memory operands for the call are - // already available in the caller's incoming argument space. - unsigned NumBytes = 0; - if (!IsSibCall) { - // We aren't sibcalling, so we need to compute FPDiff. We need to do this - // before handling assignments, because FPDiff must be known for memory - // arguments. - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); - SmallVector OutLocs; - CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext()); - analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg); - - // The callee will pop the argument stack as a tail call. Thus, we must - // keep it 16-byte aligned. - NumBytes = alignTo(OutInfo.getNextStackOffset(), 16); - - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; - - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. 
Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); - } - - const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); - - // Do the actual argument marshalling. - SmallVector PhysRegs; - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg, true, FPDiff); - if (!handleAssignments(MIRBuilder, OutArgs, Handler)) - return false; - - if (Info.IsVarArg && Info.IsMustTailCall) { - // Now we know what's being passed to the function. Add uses to the call for - // the forwarded registers that we *aren't* passing as parameters. This will - // preserve the copies we build earlier. - for (const auto &F : Forwards) { - Register ForwardedReg = F.PReg; - // If the register is already passed, or aliases a register which is - // already being passed, then skip it. - if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) { - if (!Use.isReg()) - return false; - return TRI->regsOverlap(Use.getReg(), ForwardedReg); - })) - continue; - - // We aren't passing it already, so we should add it to the call. - MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg)); - MIB.addReg(ForwardedReg, RegState::Implicit); - } - } - - // If we have -tailcallopt, we need to adjust the stack. We'll do the call - // sequence start and end here. - if (!IsSibCall) { - MIB->getOperand(1).setImm(FPDiff); - CallSeqStart.addImm(NumBytes).addImm(0); - // End the call sequence *before* emitting the call. Normally, we would - // tidy the frame up after the call. However, here, we've laid out the - // parameters so that when SP is reset, they will be in the correct - // location. - MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0); - } - - // Now we can add the actual call instruction to the correct basic block. - MIRBuilder.insertInstr(MIB); - - // If Callee is a reg, since it is used by a target specific instruction, - // it must have a register class matching the constraint of that instruction. - if (Info.Callee.isReg()) - MIB->getOperand(0).setReg(constrainOperandRegClass( - MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, - 0)); - - MF.getFrameInfo().setHasTailCall(); - Info.LoweredTailCall = true; - return true; -} - -bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallLoweringInfo &Info) const { - MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = MF.getFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &DL = F.getParent()->getDataLayout(); - const AArch64TargetLowering &TLI = *getTLI(); - - SmallVector OutArgs; - for (auto &OrigArg : Info.OrigArgs) { - splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv); - // AAPCS requires that we zero-extend i1 to 8 bits by the caller. - if (OrigArg.Ty->isIntegerTy(1)) - OutArgs.back().Flags[0].setZExt(); - } - - SmallVector InArgs; - if (!Info.OrigRet.Ty->isVoidTy()) - splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv()); - - // If we can lower as a tail call, do that instead. - bool CanTailCallOpt = - isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs); - - // We must emit a tail call if we have musttail. - if (Info.IsMustTailCall && !CanTailCallOpt) { - // There are types of incoming/outgoing arguments we can't handle yet, so - // it doesn't make sense to actually die here like in ISelLowering. 
Instead, - // fall back to SelectionDAG and let it try to handle this. - LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n"); - return false; - } - - if (CanTailCallOpt) - return lowerTailCall(MIRBuilder, Info, OutArgs); - - // Find out which ABI gets to decide where things go. - CCAssignFn *AssignFnFixed; - CCAssignFn *AssignFnVarArg; - std::tie(AssignFnFixed, AssignFnVarArg) = - getAssignFnsForCC(Info.CallConv, TLI); - - MachineInstrBuilder CallSeqStart; - CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); - - // Create a temporarily-floating call instruction so we can add the implicit - // uses of arg registers. - unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false); - - auto MIB = MIRBuilder.buildInstrNoInsert(Opc); - MIB.add(Info.Callee); - - // Tell the call which registers are clobbered. - auto TRI = MF.getSubtarget().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv); - if (MF.getSubtarget().hasCustomCallingConv()) - TRI->UpdateCustomCallPreservedMask(MF, &Mask); - MIB.addRegMask(Mask); - - if (TRI->isAnyArgRegReserved(MF)) - TRI->emitReservedArgRegCallError(MF); - - // Do the actual argument marshalling. - SmallVector PhysRegs; - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg, false); - if (!handleAssignments(MIRBuilder, OutArgs, Handler)) - return false; - - // Now we can add the actual call instruction to the correct basic block. - MIRBuilder.insertInstr(MIB); - - // If Callee is a reg, since it is used by a target specific - // instruction, it must have a register class matching the - // constraint of that instruction. - if (Info.Callee.isReg()) - MIB->getOperand(0).setReg(constrainOperandRegClass( - MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, - 0)); - - // Finally we can copy the returned value back into its virtual-register. In - // symmetry with the arguments, the physical register must be an - // implicit-define of the call instruction. - if (!Info.OrigRet.Ty->isVoidTy()) { - CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); - CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); - if (!handleAssignments(MIRBuilder, InArgs, Handler)) - return false; - } - - if (Info.SwiftErrorVReg) { - MIB.addDef(AArch64::X21, RegState::Implicit); - MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21)); - } - - uint64_t CalleePopBytes = - doesCalleeRestoreStack(Info.CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt) - ? alignTo(Handler.StackSize, 16) - : 0; - - CallSeqStart.addImm(Handler.StackSize).addImm(0); - MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) - .addImm(Handler.StackSize) - .addImm(CalleePopBytes); - - return true; -} diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.h b/llvm/lib/Target/AArch64/AArch64CallLowering.h deleted file mode 100644 index b0c601c7062c0..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.h +++ /dev/null @@ -1,82 +0,0 @@ -//===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes how to lower LLVM calls to machine code calls. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/IR/CallingConv.h" -#include -#include - -namespace llvm { - -class AArch64TargetLowering; -class CCValAssign; -class DataLayout; -class MachineIRBuilder; -class MachineRegisterInfo; -class Type; - -class AArch64CallLowering: public CallLowering { -public: - AArch64CallLowering(const AArch64TargetLowering &TLI); - - bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs, - Register SwiftErrorVReg) const override; - - bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef> VRegs) const override; - - bool lowerCall(MachineIRBuilder &MIRBuilder, - CallLoweringInfo &Info) const override; - - /// Returns true if the call can be lowered as a tail call. - bool - isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, - CallLoweringInfo &Info, - SmallVectorImpl &InArgs, - SmallVectorImpl &OutArgs) const; - - bool supportSwiftError() const override { return true; } - -private: - using RegHandler = std::function; - - using MemHandler = - std::function; - - void splitToValueTypes(const ArgInfo &OrigArgInfo, - SmallVectorImpl &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI, - CallingConv::ID CallConv) const; - - bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, - SmallVectorImpl &OutArgs) const; - - bool - doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, - MachineFunction &MF, - SmallVectorImpl &InArgs) const; - - bool - areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, - SmallVectorImpl &OutArgs) const; -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index a0695cef615f3..84ec5afcc9c19 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -38,18 +38,17 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, static bool finishStackBlock(SmallVectorImpl &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, - CCState &State, unsigned SlotAlign) { + CCState &State, Align SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); - const Align OrigAlign(ArgFlags.getOrigAlign()); - const Align Align = std::min(OrigAlign, StackAlign); + const Align OrigAlign = ArgFlags.getNonZeroOrigAlign(); + const Align Alignment = std::min(OrigAlign, StackAlign); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack( - Size, std::max((unsigned)Align.value(), SlotAlign))); + It.convertToMem(State.AllocateStack(Size, std::max(Alignment, SlotAlign))); State.addLoc(It); - SlotAlign = 1; + SlotAlign = Align(1); } // All pending members have now been allocated @@ -72,7 +71,7 @@ static bool CC_AArch64_Custom_Stack_Block( if (!ArgFlags.isInConsecutiveRegsLast()) return true; - return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, Align(8)); } /// Given an [N x Ty] block, it should be passed in a consecutive sequence of @@ -146,7 +145,7 @@ static bool 
CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, for (auto Reg : RegList) State.AllocateReg(Reg); - unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8); return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); } diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index a0b2d7712b662..fdcc890bf5892 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -10,9 +10,6 @@ // //===----------------------------------------------------------------------===// -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign : - CCIf; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; @@ -33,9 +30,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter. @@ -75,10 +72,10 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCPassIndirect>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -102,22 +99,24 @@ def CC_AArch64_AAPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, + CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>, CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -132,9 +131,9 @@ def RetCC_AArch64_AAPCS : CallingConv<[ // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. 
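Stepping back for a moment to the finishStackBlock change above (the calling-convention tables resume below): with SlotAlign now carried as an Align, the layout of a pending [N x Ty] block that spills to the stack is plain arithmetic. The sketch models it with invented helper types rather than the real CCState::AllocateStack; the AAPCS slot alignment is 8 bytes and Darwin's is 1, as in the hunk, and the 16-byte stack alignment in the example is an assumption.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

namespace sketch {

struct StackState {
  uint64_t NextOffset = 0;
  uint64_t allocate(uint64_t Size, uint64_t Alignment) {
    uint64_t Offset = (NextOffset + Alignment - 1) / Alignment * Alignment;
    NextOffset = Offset + Size;
    return Offset;
  }
};

// Lay out the members of one [N x Ty] block that did not fit in registers.
// The block alignment is min(original alignment, stack alignment); only the
// first member also gets rounded up to the slot alignment.
std::vector<uint64_t> finishStackBlock(StackState &State, unsigned NumMembers,
                                       uint64_t MemberSize, uint64_t OrigAlign,
                                       uint64_t StackAlign, uint64_t SlotAlign) {
  uint64_t Alignment = std::min(OrigAlign, StackAlign);
  std::vector<uint64_t> Offsets;
  for (unsigned I = 0; I != NumMembers; ++I) {
    Offsets.push_back(State.allocate(MemberSize, std::max(Alignment, SlotAlign)));
    SlotAlign = 1; // subsequent members pack at the block alignment only
  }
  return Offsets;
}

} // namespace sketch

int main() {
  sketch::StackState SP;
  SP.NextOffset = 4; // pretend 4 bytes of earlier arguments are already placed
  // A [3 x i32] block: 4-byte members, AAPCS slot alignment of 8.
  for (uint64_t Off : sketch::finishStackBlock(SP, 3, 4, 4, 16, 8))
    std::printf("%llu ", static_cast<unsigned long long>(Off)); // prints 8 12 16
  std::printf("\n");
  return 0;
}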
- CCIfBigEndian>>, - CCIfBigEndian>>, CCIfType<[i1, i8, i16], CCPromoteToType>, @@ -144,18 +143,20 @@ def RetCC_AArch64_AAPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv2f32, nxv4f32, nxv2f64], + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -165,7 +166,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ // Vararg functions on windows pass floats in integer registers let Entry = 1 in def CC_AArch64_Win64_VarArg : CallingConv<[ - CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f16, bf16, f32], CCPromoteToType>, CCIfType<[f64], CCBitConvertToType>, CCDelegateTo ]>; @@ -219,19 +220,22 @@ def CC_AArch64_DarwinPCS : CallingConv<[ [W0, W1, W2, W3, W4, W5, W6, W7]>>, CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, - CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16", + CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, // Re-demote pointers to 32-bits so we don't end up storing 64-bit @@ -239,9 +243,9 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCIfPtr>>, CCIfPtr>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -255,14 +259,14 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // Handle all scalar types as either i64 or f64. 
CCIfType<[i8, i16, i32], CCPromoteToType>, - CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f16, bf16, f32], CCPromoteToType>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -275,16 +279,16 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ // Handle all scalar types as either i32 or f32. CCIfType<[i8, i16], CCPromoteToType>, - CCIfType<[f16], CCPromoteToType>, + CCIfType<[f16, bf16], CCPromoteToType>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfPtr>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; @@ -377,11 +381,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, D8, D9, D10, D11, D12, D13, D14, D15)>; -// Darwin puts the frame-record at the top of the callee-save area. -def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, - D8, D9, D10, D11, - D12, D13, D14, D15)>; +// A variant for treating X18 as callee saved, when interfacing with +// code that needs X18 to be preserved. +def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>; // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, @@ -421,33 +423,7 @@ def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23), def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError - : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; - -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_AArch64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// We can only handle a register pair with adjacent registers, the register pair -// should belong to the same class as well. Since the access function on the -// fast path calls a function that follows CSR_AArch64_TLS_Darwin, -// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. -def CSR_AArch64_CXX_TLS_Darwin - : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, - (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), - (sequence "D%u", 0, 31))>; - -// CSRs that are handled by prologue, epilogue. -def CSR_AArch64_CXX_TLS_Darwin_PE - : CalleeSavedRegs<(add LR, FP)>; - -// CSRs that are handled explicitly via copies. 
-def CSR_AArch64_CXX_TLS_Darwin_ViaCopy - : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. @@ -472,14 +448,57 @@ def CSR_AArch64_StackProbe_Windows (sequence "X%u", 18, 28), FP, SP, (sequence "Q%u", 0, 31))>; +// Darwin variants of AAPCS. +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, + X22, X23, X24, X25, X26, X27, + X28, (sequence "Q%u", 8, 23))>; +def CSR_Darwin_AArch64_AAPCS_ThisReturn + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>; + +def CSR_Darwin_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_Darwin_AArch64_TLS + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_Darwin_AArch64_TLS, +// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS. +def CSR_Darwin_AArch64_CXX_TLS + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_Darwin_AArch64_CXX_TLS_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_Darwin_AArch64_CXX_TLS_ViaCopy + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>; + +def CSR_Darwin_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + // Variants of the standard calling conventions for shadow call stack. // These all preserve x18 in addition to any other registers. def CSR_AArch64_NoRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>; def CSR_AArch64_AllRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>; -def CSR_AArch64_CXX_TLS_Darwin_SCS - : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>; def CSR_AArch64_AAPCS_SwiftError_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; def CSR_AArch64_RT_MostRegs_SCS diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 688bd1b28e855..3f244ba10102a 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -105,6 +105,10 @@ struct LDTLSCleanup : public MachineFunctionPass { TII->get(TargetOpcode::COPY), AArch64::X0) .addReg(TLSBaseAddrReg); + // Update the call site info. + if (I.shouldUpdateCallSiteInfo()) + I.getMF()->eraseCallSiteInfo(&I); + // Erase the TLS_base_addr instruction. 
I.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 35e6fef24363c..efdb1131abc91 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -382,7 +382,7 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, /// Update state when seeing and ADRP instruction. static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, - LOHInfo &Info) { + LOHInfo &Info, LOHInfo *LOHInfos) { if (Info.LastADRP != nullptr) { LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t' << *Info.LastADRP); @@ -393,12 +393,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, // Produce LOH directive if possible. if (Info.IsCandidate) { switch (Info.Type) { - case MCLOH_AdrpAdd: + case MCLOH_AdrpAdd: { + // ADRPs and ADDs for this candidate may be split apart if using + // GlobalISel instead of pseudo-expanded. If that happens, the + // def register of the ADD may have a use in between. Adding an LOH in + // this case can cause the linker to rewrite the ADRP to write to that + // register, clobbering the use. + const MachineInstr *AddMI = Info.MI0; + int DefIdx = mapRegToGPRIndex(MI.getOperand(0).getReg()); + int OpIdx = mapRegToGPRIndex(AddMI->getOperand(0).getReg()); + LOHInfo DefInfo = LOHInfos[OpIdx]; + if (DefIdx != OpIdx && (DefInfo.OneUser || DefInfo.MultiUsers)) + break; LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t' << *Info.MI0); AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0}); ++NumADRSimpleCandidate; break; + } case MCLOH_AdrpLdr: if (supportLoadFromLiteral(*Info.MI0)) { LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" @@ -522,7 +534,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { // Walk the basic block backwards and update the per register state machine // in the process. - for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { + for (const MachineInstr &MI : + instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AArch64::ADDXri: @@ -544,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { const MachineOperand &Op0 = MI.getOperand(0); int Idx = mapRegToGPRIndex(Op0.getReg()); if (Idx >= 0) { - handleADRP(MI, AFI, LOHInfos[Idx]); + handleADRP(MI, AFI, LOHInfos[Idx], LOHInfos); continue; } break; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index bb99f2516ecf0..aa41cae289e8b 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -11,8 +11,74 @@ include "llvm/Target/GlobalISel/Combine.td" +def fconstant_to_constant : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_FCONSTANT):$root, + [{ return matchFConstantToConstant(*${root}, MRI); }]), + (apply [{ applyFConstantToConstant(*${root}); }])>; + def AArch64PreLegalizerCombinerHelper: GICombinerHelper< "AArch64GenPreLegalizerCombinerHelper", [all_combines, - elide_br_by_inverting_cond]> { + elide_br_by_inverting_cond, + fconstant_to_constant]> { let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; + let StateClass = "AArch64PreLegalizerCombinerHelperState"; + let AdditionalArguments = []; +} + +// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a +// target-specific opcode. 
+def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">; + +def rev : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchREV(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def zip : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchZip(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def uzp : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def dup: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchDup(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def trn : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchTRN(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def ext: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchEXT(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyEXT(*${root}, ${matchinfo}); }]) +>; + +// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo +// instruction. +def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>; + +def AArch64PostLegalizerCombinerHelper + : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", + [erase_undef_store, combines_for_extload, + sext_already_extended, shuffle_vector_pseudos]> { + let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 2592387059652..57dc8a4061f12 100644 --- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -79,7 +79,7 @@ void AArch64CompressJumpTables::scanFunction() { for (MachineBasicBlock &MBB : *MF) { const Align Alignment = MBB.getAlignment(); unsigned AlignedOffset; - if (Alignment == Align::None()) + if (Alignment == Align(1)) AlignedOffset = Offset; else AlignedOffset = alignTo(Offset, Alignment); diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index 25e23e4623de1..e90e8e3da0576 100644 --- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -194,12 +194,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, // There must not be any instruction between DefMI and MI that clobbers or // reads NZCV. 
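The hand-written scan that the next two hunks delete is replaced by a call to isNZCVTouchedInInstructionRange; that helper is defined elsewhere in the patch, so its exact signature is not shown here. The sketch below is only a container-level model of the same check: any read or write of NZCV strictly between the flag-setting instruction and the branch makes the rewrite unsafe.

#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

struct Inst {
  bool ReadsNZCV = false;
  bool ModifiesNZCV = false;
};

// Scan the open interval (DefIdx, UseIdx): the defining compare and the
// branch themselves are excluded, everything in between must leave NZCV alone.
bool nzcvTouchedInRange(const std::vector<Inst> &Block, std::size_t DefIdx,
                        std::size_t UseIdx) {
  return std::any_of(Block.begin() + DefIdx + 1, Block.begin() + UseIdx,
                     [](const Inst &I) { return I.ReadsNZCV || I.ModifiesNZCV; });
}

} // namespace sketch

int main() {
  std::vector<sketch::Inst> Block(4);
  Block[2].ModifiesNZCV = true; // an intervening compare clobbers the flags
  return sketch::nzcvTouchedInRange(Block, 0, 3) ? 0 : 1; // touched, so bail out
}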
- MachineBasicBlock::iterator I(DefMI), E(MI); - for (I = std::next(I); I != E; ++I) { - if (I->modifiesRegister(AArch64::NZCV, TRI) || - I->readsRegister(AArch64::NZCV, TRI)) - return false; - } + if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI)) + return false; LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); LLVM_DEBUG(DefMI.print(dbgs())); LLVM_DEBUG(dbgs() << " "); @@ -253,12 +249,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, return false; // There must not be any instruction between DefMI and MI that clobbers or // reads NZCV. - MachineBasicBlock::iterator I(DefMI), E(MI); - for (I = std::next(I); I != E; ++I) { - if (I->modifiesRegister(AArch64::NZCV, TRI) || - I->readsRegister(AArch64::NZCV, TRI)) - return false; - } + if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI)) + return false; LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); LLVM_DEBUG(DefMI.print(dbgs())); LLVM_DEBUG(dbgs() << " "); diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 51b2ce0297019..64f0bb63762de 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -145,11 +145,11 @@ void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { // instructions. MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( MachineBasicBlock *MBB) { - MachineBasicBlock::iterator I = MBB->getFirstTerminator(); - if (I == MBB->end()) + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + if (Term == MBB->end()) return nullptr; - if (I->getOpcode() != AArch64::Bcc) + if (Term->getOpcode() != AArch64::Bcc) return nullptr; // Since we may modify cmp of this MBB, make sure NZCV does not live out. @@ -158,32 +158,33 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( return nullptr; // Now find the instruction controlling the terminator. - for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; - assert(!I->isTerminator() && "Spurious terminator"); + for (MachineBasicBlock::iterator B = MBB->begin(), It = Term; It != B;) { + It = prev_nodbg(It, B); + MachineInstr &I = *It; + assert(!I.isTerminator() && "Spurious terminator"); // Check if there is any use of NZCV between CMP and Bcc. - if (I->readsRegister(AArch64::NZCV)) + if (I.readsRegister(AArch64::NZCV)) return nullptr; - switch (I->getOpcode()) { + switch (I.getOpcode()) { // cmp is an alias for subs with a dead destination register. case AArch64::SUBSWri: case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. 
case AArch64::ADDSWri: case AArch64::ADDSXri: { - unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); - if (!I->getOperand(2).isImm()) { - LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + unsigned ShiftAmt = AArch64_AM::getShiftValue(I.getOperand(3).getImm()); + if (!I.getOperand(2).isImm()) { + LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << I << '\n'); return nullptr; - } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { - LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I + } else if (I.getOperand(2).getImm() << ShiftAmt >= 0xfff) { + LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << I << '\n'); return nullptr; - } else if (!MRI->use_empty(I->getOperand(0).getReg())) { - LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + } else if (!MRI->use_nodbg_empty(I.getOperand(0).getReg())) { + LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << I << '\n'); return nullptr; } - return &*I; + return &I; } // Prevent false positive case like: // cmp w19, #0 @@ -294,12 +295,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, .add(BrMI.getOperand(1)); BrMI.eraseFromParent(); - MBB->updateTerminator(); - ++NumConditionsAdjusted; } -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// Parse a condition code returned by analyzeBranch, and compute the CondCode // corresponding to TBB. // Returns true if parsing was successful, otherwise false is returned. static bool parseCond(ArrayRef Cond, AArch64CC::CondCode &CC) { diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 054ef8f482ca9..82e8df3b73f90 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -157,7 +157,7 @@ public: MachineInstr *CmpMI; private: - /// The branch condition in Head as determined by AnalyzeBranch. + /// The branch condition in Head as determined by analyzeBranch. SmallVector HeadCond; /// The condition code that makes Head branch to CmpBB. @@ -267,7 +267,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { return MRI->use_nodbg_empty(DstReg); } -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// Parse a condition code returned by analyzeBranch, and compute the CondCode // corresponding to TBB. // Return static bool parseCond(ArrayRef Cond, AArch64CC::CondCode &CC) { @@ -317,7 +317,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { // Now find the instruction controlling the terminator. for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; + I = prev_nodbg(I, MBB->begin()); assert(!I->isTerminator() && "Spurious terminator"); switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. @@ -509,7 +509,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // landing pad. 
if (!TBB || HeadCond.empty()) { LLVM_DEBUG( - dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); + dbgs() << "analyzeBranch didn't find conditional branch in Head.\n"); ++NumHeadBranchRejs; return false; } @@ -536,7 +536,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { if (!TBB || CmpBBCond.empty()) { LLVM_DEBUG( - dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); + dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n"); ++NumCmpBranchRejs; return false; } @@ -710,7 +710,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { .add(CmpMI->getOperand(1)); // Branch target. } CmpMI->eraseFromParent(); - Head->updateTerminator(); + Head->updateTerminator(CmpBB->getNextNode()); RemovedBlocks.push_back(CmpBB); CmpBB->eraseFromParent(); @@ -828,7 +828,7 @@ void AArch64ConditionalCompares::updateDomTree( assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) - DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->changeImmediateDominator(Node->back(), HeadNode); DomTree->eraseNode(RemovedMBB); } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 3b8f8a19fe49c..9e65ad2e18f95 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -68,6 +68,8 @@ private: bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + bool expand_DestructiveOp(MachineInstr &MI, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, @@ -78,6 +80,9 @@ private: bool expandSetTagLoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandSVESpillFill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned Opc, + unsigned N); }; } // end anonymous namespace @@ -344,27 +349,225 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( return true; } +/// \brief Expand Pseudos to Instructions with destructive operands. +/// +/// This mechanism uses MOVPRFX instructions for zeroing the false lanes +/// or for fixing relaxed register allocation conditions to comply with +/// the instructions register constraints. The latter case may be cheaper +/// than setting the register constraints in the register allocator, +/// since that will insert regular MOV instructions rather than MOVPRFX. +/// +/// Example (after register allocation): +/// +/// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0 +/// +/// * The Pseudo FSUB_ZPZZ_ZERO_B maps to FSUB_ZPmZ_B. +/// * We cannot map directly to FSUB_ZPmZ_B because the register +/// constraints of the instruction are not met. +/// * Also the _ZERO specifies the false lanes need to be zeroed. +/// +/// We first try to see if the destructive operand == result operand, +/// if not, we try to swap the operands, e.g. 
+/// +/// FSUB_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// But because FSUB_ZPmZ is not commutative, this is semantically +/// different, so we need a reverse instruction: +/// +/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// Then we implement the zeroing of the false lanes of Z0 by adding +/// a zeroing MOVPRFX instruction: +/// +/// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0 +/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 +/// +/// Note that this can only be done for _ZERO or _UNDEF variants where +/// we can guarantee the false lanes to be zeroed (by implementing this) +/// or that they are undef (don't care / not used), otherwise the +/// swapping of operands is illegal because the operation is not +/// (or cannot be emulated to be) fully commutative. +bool AArch64ExpandPseudo::expand_DestructiveOp( + MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode()); + uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask; + uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask; + bool FalseZero = FalseLanes == AArch64::FalseLanesZero; + + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + + if (DType == AArch64::DestructiveBinary) + assert(DstReg != MI.getOperand(3).getReg()); + + bool UseRev = false; + unsigned PredIdx, DOPIdx, SrcIdx; + switch (DType) { + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + if (DstReg == MI.getOperand(3).getReg()) { + // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1 + std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2); + UseRev = true; + break; + } + LLVM_FALLTHROUGH; + case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: + std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); + break; + default: + llvm_unreachable("Unsupported Destructive Operand type"); + } + +#ifndef NDEBUG + // MOVPRFX can only be used if the destination operand + // is the destructive operand, not as any other operand, + // so the Destructive Operand must be unique. + bool DOPRegIsUnique = false; + switch (DType) { + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + DOPRegIsUnique = + DstReg != MI.getOperand(DOPIdx).getReg() || + MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg(); + break; + case AArch64::DestructiveBinaryImm: + DOPRegIsUnique = true; + break; + } +#endif + + // Resolve the reverse opcode + if (UseRev) { + int NewOpcode; + // e.g. DIV -> DIVR + if ((NewOpcode = AArch64::getSVERevInstr(Opcode)) != -1) + Opcode = NewOpcode; + // e.g. 
DIVR -> DIV + else if ((NewOpcode = AArch64::getSVENonRevInstr(Opcode)) != -1) + Opcode = NewOpcode; + } + + // Get the right MOVPRFX + uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode); + unsigned MovPrfx, MovPrfxZero; + switch (ElementSize) { + case AArch64::ElementSizeNone: + case AArch64::ElementSizeB: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B; + break; + case AArch64::ElementSizeH: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H; + break; + case AArch64::ElementSizeS: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S; + break; + case AArch64::ElementSizeD: + MovPrfx = AArch64::MOVPRFX_ZZ; + MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D; + break; + default: + llvm_unreachable("Unsupported ElementSize"); + } + + // + // Create the destructive operation (if required) + // + MachineInstrBuilder PRFX, DOP; + if (FalseZero) { +#ifndef NDEBUG + assert(DOPRegIsUnique && "The destructive operand should be unique"); +#endif + assert(ElementSize != AArch64::ElementSizeNone && + "This instruction is unpredicated"); + + // Merge source operand into destination register + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) + .addReg(DstReg, RegState::Define) + .addReg(MI.getOperand(PredIdx).getReg()) + .addReg(MI.getOperand(DOPIdx).getReg()); + + // After the movprfx, the destructive operand is same as Dst + DOPIdx = 0; + } else if (DstReg != MI.getOperand(DOPIdx).getReg()) { +#ifndef NDEBUG + assert(DOPRegIsUnique && "The destructive operand should be unique"); +#endif + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) + .addReg(DstReg, RegState::Define) + .addReg(MI.getOperand(DOPIdx).getReg()); + DOPIdx = 0; + } + + // + // Create the destructive operation + // + DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + + switch (DType) { + case AArch64::DestructiveBinaryImm: + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + DOP.add(MI.getOperand(PredIdx)) + .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .add(MI.getOperand(SrcIdx)); + break; + } + + if (PRFX) { + finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); + transferImpOps(MI, PRFX, DOP); + } else + transferImpOps(MI, DOP, DOP); + + MI.eraseFromParent(); + return true; +} + bool AArch64ExpandPseudo::expandSetTagLoop( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(2).getReg(); - Register AddressReg = MI.getOperand(3).getReg(); + Register SizeReg = MI.getOperand(0).getReg(); + Register AddressReg = MI.getOperand(1).getReg(); MachineFunction *MF = MBB.getParent(); - bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode = + bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback; + const unsigned OpCode1 = + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; + const unsigned OpCode2 = ZeroData ? 
AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex; + unsigned Size = MI.getOperand(2).getImm(); + assert(Size > 0 && Size % 16 == 0); + if (Size % (16 * 2) != 0) { + BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg) + .addReg(AddressReg) + .addReg(AddressReg) + .addImm(1); + Size -= 16; + } + MachineBasicBlock::iterator I = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg) + .addImm(Size); + expandMOVImm(MBB, I, 64); + auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoopBB); MF->insert(++LoopBB->getIterator(), DoneBB); - BuildMI(LoopBB, DL, TII->get(OpCode)) + BuildMI(LoopBB, DL, TII->get(OpCode2)) .addDef(AddressReg) .addReg(AddressReg) .addReg(AddressReg) @@ -402,6 +605,28 @@ bool AArch64ExpandPseudo::expandSetTagLoop( return true; } +bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned Opc, unsigned N) { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + MachineInstr &MI = *MBBI; + for (unsigned Offset = 0; Offset < N; ++Offset) { + int ImmOffset = MI.getOperand(2).getImm() + Offset; + bool Kill = (Offset + 1 == N) ? MI.getOperand(1).isKill() : false; + assert(ImmOffset >= -256 && ImmOffset < 256 && + "Immediate spill offset out of range"); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg( + TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset), + Opc == AArch64::LDR_ZXI ? RegState::Define : 0) + .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill)) + .addImm(ImmOffset); + } + MI.eraseFromParent(); + return true; +} + /// If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -409,10 +634,76 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); + + // Check if we can expand the destructive op + int OrigInstr = AArch64::getSVEPseudoMap(MI.getOpcode()); + if (OrigInstr != -1) { + auto &Orig = TII->get(OrigInstr); + if ((Orig.TSFlags & AArch64::DestructiveInstTypeMask) + != AArch64::NotDestructive) { + return expand_DestructiveOp(MI, MBB, MBBI); + } + } + switch (Opcode) { default: break; + case AArch64::BSPv8i8: + case AArch64::BSPv16i8: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to BIT + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to BIF + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + } else { + // Expand to BSL, use additional move if required + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } else { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::ORRv8i8 + : AArch64::ORRv16i8)) + .addReg(DstReg, + RegState::Define | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } + } + MI.eraseFromParent(); + return true; + } + case AArch64::ADDWrr: case AArch64::SUBWrr: case AArch64::ADDXrr: @@ -599,10 +890,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, Register DstReg = MI.getOperand(0).getReg(); auto SysReg = AArch64SysReg::TPIDR_EL0; MachineFunction *MF = MBB.getParent(); - if (MF->getTarget().getTargetTriple().isOSFuchsia() && - MF->getTarget().getCodeModel() == CodeModel::Kernel) - SysReg = AArch64SysReg::TPIDR_EL1; - else if (MF->getSubtarget().useEL3ForTP()) + if (MF->getSubtarget().useEL3ForTP()) SysReg = AArch64SysReg::TPIDR_EL3; else if (MF->getSubtarget().useEL2ForTP()) SysReg = AArch64SysReg::TPIDR_EL2; @@ -676,7 +964,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, // almost always point to SP-after-prologue; if not, emit a longer // instruction sequence. int BaseOffset = -AFI->getTaggedBasePointerOffset(); - unsigned FrameReg; + Register FrameReg; StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, /*PreferFP=*/false, @@ -706,9 +994,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::STGloop_wback: + case AArch64::STZGloop_wback: + return expandSetTagLoop(MBB, MBBI, NextMBBI); case AArch64::STGloop: case AArch64::STZGloop: - return expandSetTagLoop(MBB, MBBI, NextMBBI); + report_fatal_error( + "Non-writeback variants of STGloop / STZGloop should not " + "survive past PrologEpilogInserter."); + case AArch64::STR_ZZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4); + case AArch64::STR_ZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); + case AArch64::STR_ZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); + case AArch64::LDR_ZZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); + case AArch64::LDR_ZZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); + case AArch64::LDR_ZZXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index c1fc183b04f6f..538863ebe95af 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -823,9 +823,6 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast(ST.getInstrInfo()); TRI = ST.getRegisterInfo(); - assert(TRI->trackLivenessAfterRegAlloc(Fn) && - "Register liveness not available!"); - MachineLoopInfo &LI = getAnalysis(); Modified = false; diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 7e9c68f2bb305..0f63f4ca62e5e 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -434,11 +434,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { // Materialize via constant pool. MachineConstantPool wants an explicit // alignment. 
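As a reading aid for the STR_ZZ*/LDR_ZZ* cases handled above: expandSVESpillFill splits one N-register SVE tuple spill or fill into N unit STR_ZXI/LDR_ZXI operations at consecutive immediate offsets, each checked against the signed [-256, 255] range. Below is a minimal stand-alone sketch of that splitting logic in plain C++ with no LLVM types; emitUnitSpill and its textual output are purely illustrative.

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for emitting one STR_ZXI with the given sub-register
// index and scaled immediate offset.
static void emitUnitSpill(unsigned SubRegIdx, int ImmOffset, bool KillBase) {
  std::printf("  str zsub%u, [base, #%d, mul vl]%s\n", SubRegIdx, ImmOffset,
              KillBase ? "  ; base killed on last store" : "");
}

// Split an N-register tuple spill starting at BaseImm into N unit spills,
// mirroring the loop in expandSVESpillFill: the offset grows by one per
// register and the base register is only marked killed on the final store.
static void expandTupleSpill(unsigned N, int BaseImm, bool BaseIsKill) {
  for (unsigned Offset = 0; Offset < N; ++Offset) {
    int ImmOffset = BaseImm + static_cast<int>(Offset);
    assert(ImmOffset >= -256 && ImmOffset < 256 && "spill offset out of range");
    emitUnitSpill(Offset, ImmOffset, (Offset + 1 == N) && BaseIsKill);
  }
}

int main() {
  expandTupleSpill(/*N=*/4, /*BaseImm=*/-8, /*BaseIsKill=*/true);
  return 0;
}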
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) - Align = DL.getTypeAllocSize(CFP->getType()); + Align Alignment = DL.getPrefTypeAlign(CFP->getType()); - unsigned CPI = MCP.getConstantPoolIndex(cast(CFP), Align); + unsigned CPI = MCP.getConstantPoolIndex(cast(CFP), Alignment); unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); @@ -1130,7 +1128,7 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -3137,7 +3135,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, Addr.setReg(AArch64::SP); Addr.setOffset(VA.getLocMemOffset() + BEAlign); - unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + Align Alignment = DL.getABITypeAlign(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); @@ -3272,7 +3270,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue the call. MachineInstrBuilder MIB; if (Subtarget->useSmallAddressing()) { - const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL); + const MCInstrDesc &II = + TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : (unsigned)AArch64::BL); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); if (Symbol) MIB.addSym(Symbol, 0); @@ -3305,7 +3304,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (!CallReg) return false; - const MCInstrDesc &II = TII.get(AArch64::BLR); + const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF)); CallReg = constrainOperandRegClass(II, CallReg, 0); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ea3e800a1ad20..efa3fd5ca9cef 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -170,8 +170,45 @@ static cl::opt cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); +static cl::opt StackTaggingMergeSetTag( + "stack-tagging-merge-settag", + cl::desc("merge settag instruction in function epilog"), cl::init(true), + cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); +/// Returns the argument pop size. 
+static uint64_t getArgumentPopSize(MachineFunction &MF, + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + bool IsTailCallReturn = false; + if (MBB.end() != MBBI) { + unsigned RetOpcode = MBBI->getOpcode(); + IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || + RetOpcode == AArch64::TCRETURNri || + RetOpcode == AArch64::TCRETURNriBTI; + } + AArch64FunctionInfo *AFI = MF.getInfo(); + + uint64_t ArgumentPopSize = 0; + if (IsTailCallReturn) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + return ArgumentPopSize; +} + /// This is the biggest offset to the stack pointer we can encode in aarch64 /// instructions (without using a separate calculation and a temp register). /// Note that the exception here are vector stores/loads which cannot encode any @@ -211,6 +248,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const { return TargetStackID::SVEVector; } +/// Returns the size of the fixed object area (allocated next to sp on entry) +/// On Win64 this may include a var args area and an UnwindHelp object for EH. +static unsigned getFixedObjectSize(const MachineFunction &MF, + const AArch64FunctionInfo *AFI, bool IsWin64, + bool IsFunclet) { + if (!IsWin64 || IsFunclet) { + // Only Win64 uses fixed objects, and then only for the function (not + // funclets) + return 0; + } else { + // Var args are stored here in the primary function. + const unsigned VarArgsArea = AFI->getVarArgsGPRSize(); + // To support EH funclets we allocate an UnwindHelp object + const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0); + return alignTo(VarArgsArea + UnwindHelpObject, 16); + } +} + /// Returns the size of the entire SVE stackframe (calleesaves + spills). static StackOffset getSVEStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo(); @@ -286,10 +341,8 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; if (!hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - int64_t Amount = I->getOperand(0).getImm(); - Amount = alignTo(Amount, Align); + Amount = alignTo(Amount, getStackAlign()); if (!IsDestroy) Amount = -Amount; @@ -480,6 +533,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( return true; } +bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( + MachineBasicBlock &MBB, unsigned StackBumpBytes) const { + if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) + return false; + + if (MBB.empty()) + return true; + + // Disable combined SP bump if the last instruction is an MTE tag store. It + // is almost always better to merge SP adjustment into those instructions. 
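For orientation, getFixedObjectSize (added a few hunks up) only reserves a fixed object area for Win64 parent functions: the var-args GPR save area plus an 8-byte UnwindHelp slot when the function has EH funclets, rounded up to 16 bytes. A stand-alone restatement of that computation in plain C++; the parameter names are illustrative.

#include <cassert>

// Mirror of getFixedObjectSize: only Win64 parent functions (not funclets)
// carry a fixed object area next to SP on entry.
static unsigned fixedObjectSize(bool IsWin64, bool IsFunclet,
                                unsigned VarArgsGPRSize, bool HasEHFunclets) {
  if (!IsWin64 || IsFunclet)
    return 0;
  unsigned UnwindHelp = HasEHFunclets ? 8 : 0;
  // Round up to the 16-byte stack alignment.
  return (VarArgsGPRSize + UnwindHelp + 15) & ~15u;
}

int main() {
  assert(fixedObjectSize(true, false, 56, true) == 64);  // 56 + 8 -> 64
  assert(fixedObjectSize(true, true, 56, true) == 0);    // funclet
  assert(fixedObjectSize(false, false, 56, true) == 0);  // not Win64
  return 0;
}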
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator(); + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastI != Begin) { + --LastI; + if (LastI->isTransient()) + continue; + if (!LastI->getFlag(MachineInstr::FrameDestroy)) + break; + } + switch (LastI->getOpcode()) { + case AArch64::STGloop: + case AArch64::STZGloop: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return false; + default: + return true; + } + llvm_unreachable("unreachable"); +} + // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, @@ -940,11 +1026,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } @@ -959,10 +1045,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = (IsWin64 && !IsFunclet) ? - alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. @@ -993,32 +1076,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; } - // The code below is not applicable to funclets. We have emitted all the SEH - // opcodes that we needed to emit. The FP and BP belong to the containing - // function. - if (IsFunclet) { - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - - // SEH funclets are passed the frame pointer in X1. If the parent - // function uses the base register, then the base register is used - // directly, and is not retrieved from X1. - if (F.hasPersonalityFn()) { - EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); - if (isAsynchronousEHPersonality(Per)) { - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) - .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup); - MBB.addLiveIn(AArch64::X1); - } - } - - return; - } - - if (HasFP) { + // For funclets the FP belongs to the containing function. + if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. int64_t FPOffset = isTargetDarwin(MF) ? 
(AFI->getCalleeSavedStackSize() - 16) : 0; @@ -1099,7 +1158,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) + BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF))) .addReg(AArch64::X16, RegState::Kill) .addReg(AArch64::X15, RegState::Implicit | RegState::Define) .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) @@ -1161,7 +1220,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Allocate space for the rest of the frame. if (NumBytes) { - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + // Alignment is required for the parent frame, not the funclet + const bool NeedsRealignment = + !IsFunclet && RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1179,8 +1240,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { - const unsigned Alignment = MFI.getMaxAlignment(); - const unsigned NrBitsToZero = countTrailingZeros(Alignment); + const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); assert(NrBitsToZero > 1); assert(scratchSPReg != AArch64::SP); @@ -1215,7 +1275,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is // needed. - if (RegInfo->hasBasePointer(MF)) { + // For funclets the BP belongs to the containing function. + if (!IsFunclet && RegInfo->hasBasePointer(MF)) { TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, false); if (NeedsWinCFI) { @@ -1232,6 +1293,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + // SEH funclets are passed the frame pointer in X1. If the parent + // function uses the base register, then the base register is used + // directly, and is not retrieved from X1. + if (IsFunclet && F.hasPersonalityFn()) { + EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); + if (isAsynchronousEHPersonality(Per)) { + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) + .addReg(AArch64::X1) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(AArch64::X1); + } + } + if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); const int StackGrowth = isTargetDarwin(MF) @@ -1307,15 +1381,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (HasFP) { // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, StackGrowth - FixedObject)); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. 
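Note on the CFI hunks here and just below: the patch moves from createDefCfa/createDefCfaOffset, which by the old helpers' convention were handed the negated value, to cfiDefCfa/cfiDefCfaOffset, which take the CFA offset directly, so the operand's sign flip does not change the emitted unwind info. For example, for a leaf function with a 64-byte frame the old call passed -64 and the new one passes 64; both encode DW_CFA_def_cfa_offset 64.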
unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); + MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1374,7 +1448,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, const AArch64Subtarget &Subtarget = MF.getSubtarget(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; - bool IsTailCallReturn = false; bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; bool IsFunclet = false; @@ -1385,10 +1458,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || - RetOpcode == AArch64::TCRETURNri || - RetOpcode == AArch64::TCRETURNriBTI; IsFunclet = isFuncletReturnInstr(*MBBI); } @@ -1403,21 +1472,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = 0; - if (IsTailCallReturn) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } + uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB); // The stack frame should be like below, // @@ -1450,10 +1505,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = - (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); uint64_t AfterCSRPopSize = ArgumentPopSize; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; @@ -1463,7 +1515,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. if (!CombineSPBump && PrologueSaveSize != 0) { @@ -1660,7 +1712,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, /// SP-relative and simple call frames aren't used. int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { + Register &FrameReg) const { return resolveFrameIndexReference( MF, FI, FrameReg, /*PreferFP=*/ @@ -1679,7 +1731,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = IsWin64 ? 
alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + unsigned FixedObject = + getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; @@ -1701,7 +1755,7 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, } StackOffset AArch64FrameLowering::resolveFrameIndexReference( - const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); @@ -1713,7 +1767,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( StackOffset AArch64FrameLowering::resolveFrameOffsetReference( const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, - unsigned &FrameReg, bool PreferFP, bool ForSimm) const { + Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); @@ -1764,10 +1818,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) { // Can't use BP. Forced to use FP. - assert(!SVEStackSize && "Expected BP to be available"); + else if (!CanUseBP) // Can't use BP. Forced to use FP. UseFP = true; - } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1933,7 +1985,7 @@ struct RegPairInfo { } // end anonymous namespace static void computeCalleeSaveRegisterPairs( - MachineFunction &MF, const std::vector &CSI, + MachineFunction &MF, ArrayRef CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { @@ -2058,8 +2110,8 @@ static void computeCalleeSaveRegisterPairs( FixupDone = true; ByteOffset -= 8; assert(ByteOffset % 16 == 0); - assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); - MFI.setObjectAlignment(RPI.FrameIdx, 16); + assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); + MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); } int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset; @@ -2078,8 +2130,7 @@ static void computeCalleeSaveRegisterPairs( bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + ArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); bool NeedsWinCFI = needsWinCFI(MF); @@ -2142,32 +2193,33 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size, Align; + unsigned Size; + Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? 
AArch64::STPDi : AArch64::STRDui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = AArch64::STR_ZXI; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; Size = 2; - Align = 2; + Alignment = Align(2); break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); @@ -2196,7 +2248,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), - MachineMemOperand::MOStore, Size, Align)); + MachineMemOperand::MOStore, Size, Alignment)); } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) @@ -2204,8 +2256,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // where factor*scale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(MF,FrameIdxReg1), - MachineMemOperand::MOStore, Size, Align)); + MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), + MachineMemOperand::MOStore, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameSetup); @@ -2220,8 +2272,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; @@ -2248,32 +2299,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size, Align; + unsigned Size; + Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; Size = 8; - Align = 8; + Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? 
AArch64::LDPQi : AArch64::LDRQui; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = AArch64::LDR_ZXI; Size = 16; - Align = 16; + Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; Size = 2; - Align = 2; + Alignment = Align(2); break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); @@ -2296,7 +2348,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), - MachineMemOperand::MOLoad, Size, Align)); + MachineMemOperand::MOLoad, Size, Alignment)); } MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) @@ -2305,7 +2357,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), - MachineMemOperand::MOLoad, Size, Align)); + MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); }; @@ -2348,6 +2400,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); AArch64FunctionInfo *AFI = MF.getInfo(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2396,6 +2449,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } } + if (MF.getFunction().getCallingConv() == CallingConv::Win64 && + !Subtarget.isTargetWindows()) { + // For Windows calling convention on a non-windows OS, where X18 is treated + // as reserved, back up X18 when entering non-windows code (marked with the + // Windows calling convention) and restore when returning regardless of + // whether the individual function uses it - it might call other functions + // that clobber it. + SavedRegs.set(AArch64::X18); + } + // Calculates the callee saved stack size. unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; @@ -2467,8 +2530,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass &RC = AArch64::GPR64RegClass; unsigned Size = TRI->getSpillSize(RC); - unsigned Align = TRI->getSpillAlignment(RC); - int FI = MFI.CreateStackObject(Size, Align, false); + Align Alignment = TRI->getSpillAlign(RC); + int FI = MFI.CreateStackObject(Size, Alignment, false); RS->addScavengingFrameIndex(FI); LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); @@ -2549,12 +2612,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { // Make sure to align the last callee save slot. - MFI.setObjectAlignment(MaxCSFrameIndex, 16U); + MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); // Assign offsets to the callee save slots. 
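The callee-save loop that follows and the later locals loop both assign SVE objects downward-growing negative offsets, aligning the running total after adding each object's size. A compact stand-alone model in plain C++; Obj and assignSVEOffsets are illustrative names, and alignments are assumed to be powers of two.

#include <cassert>
#include <cstdint>
#include <vector>

struct Obj { int64_t Size; int64_t Align; int64_t Offset; }; // illustrative

// Grow the running offset by each object's size, round it up to the object's
// alignment, and record the slot as -Offset (the SVE area grows downwards).
static int64_t assignSVEOffsets(std::vector<Obj> &Objects) {
  int64_t Offset = 0;
  for (Obj &O : Objects) {
    Offset += O.Size;
    Offset = (Offset + O.Align - 1) & ~(O.Align - 1); // power-of-two align
    O.Offset = -Offset;
  }
  return Offset; // running size of the SVE area so far
}

int main() {
  // Two Z-register spills (16 bytes, align 16) around a predicate spill
  // (2 bytes, align 2).
  std::vector<Obj> Objs = {{16, 16, 0}, {2, 2, 0}, {16, 16, 0}};
  int64_t Total = assignSVEOffsets(Objs);
  assert(Objs[0].Offset == -16 && Objs[1].Offset == -18 && Objs[2].Offset == -48);
  assert(Total == 48);
  return 0;
}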
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlignment(I)); + Offset = alignTo(Offset, MFI.getObjectAlign(I)); if (AssignOffsets) Assign(I, -Offset); } @@ -2576,15 +2639,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Allocate all SVE locals and spills for (unsigned FI : ObjectsToAllocate) { - unsigned Align = MFI.getObjectAlignment(FI); + Align Alignment = MFI.getObjectAlign(FI); // FIXME: Given that the length of SVE vectors is not necessarily a power of // two, we'd need to align every object dynamically at runtime if the // alignment is larger than 16. This is not yet supported. - if (Align > 16) + if (Alignment > Align(16)) report_fatal_error( "Alignment of scalable vectors > 16 bytes is not yet supported"); - Offset = alignTo(Offset + MFI.getObjectSize(FI), Align); + Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); if (AssignOffsets) Assign(FI, -Offset); } @@ -2632,9 +2695,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( ++MBBI; // Create an UnwindHelp object. - int UnwindHelpFI = - MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); + // The UnwindHelp object is allocated at the start of the fixed object area + int64_t FixedObject = + getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false); + int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8, + /*SPOffset*/ -FixedObject, + /*IsImmutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; + // We need to store -2 into the UnwindHelp object at the start of the // function. DebugLoc DL; @@ -2649,17 +2717,411 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( .addImm(0); } -/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before -/// the update. This is easily retrieved as it is exactly the offset that is set -/// in processFunctionBeforeFrameFinalized. +namespace { +struct TagStoreInstr { + MachineInstr *MI; + int64_t Offset, Size; + explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) + : MI(MI), Offset(Offset), Size(Size) {} +}; + +class TagStoreEdit { + MachineFunction *MF; + MachineBasicBlock *MBB; + MachineRegisterInfo *MRI; + // Tag store instructions that are being replaced. + SmallVector TagStores; + // Combined memref arguments of the above instructions. + SmallVector CombinedMemRefs; + + // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + + // FrameRegOffset + Size) with the address tag of SP. + Register FrameReg; + StackOffset FrameRegOffset; + int64_t Size; + // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end. + Optional FrameRegUpdate; + // MIFlags for any FrameReg updating instructions. + unsigned FrameRegUpdateFlags; + + // Use zeroing instruction variants. + bool ZeroData; + DebugLoc DL; + + void emitUnrolled(MachineBasicBlock::iterator InsertI); + void emitLoop(MachineBasicBlock::iterator InsertI); + +public: + TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) + : MBB(MBB), ZeroData(ZeroData) { + MF = MBB->getParent(); + MRI = &MF->getRegInfo(); + } + // Add an instruction to be replaced. Instructions must be added in the + // ascending order of Offset, and have to be adjacent. 
+ void addInstruction(TagStoreInstr I) { + assert((TagStores.empty() || + TagStores.back().Offset + TagStores.back().Size == I.Offset) && + "Non-adjacent tag store instructions."); + TagStores.push_back(I); + } + void clear() { TagStores.clear(); } + // Emit equivalent code at the given location, and erase the current set of + // instructions. May skip if the replacement is not profitable. May invalidate + // the input iterator and replace it with a valid one. + void emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast); +}; + +void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget().getInstrInfo(); + + const int64_t kMinOffset = -256 * 16; + const int64_t kMaxOffset = 255 * 16; + + Register BaseReg = FrameReg; + int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes(); + if (BaseRegOffsetBytes < kMinOffset || + BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { + Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, + {BaseRegOffsetBytes, MVT::i8}, TII); + BaseReg = ScratchReg; + BaseRegOffsetBytes = 0; + } + + MachineInstr *LastI = nullptr; + while (Size) { + int64_t InstrSize = (Size > 16) ? 32 : 16; + unsigned Opcode = + InstrSize == 16 + ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset) + : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset); + MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) + .addReg(AArch64::SP) + .addReg(BaseReg) + .addImm(BaseRegOffsetBytes / 16) + .setMemRefs(CombinedMemRefs); + // A store to [BaseReg, #0] should go last for an opportunity to fold the + // final SP adjustment in the epilogue. + if (BaseRegOffsetBytes == 0) + LastI = I; + BaseRegOffsetBytes += InstrSize; + Size -= InstrSize; + } + + if (LastI) + MBB->splice(InsertI, MBB, LastI); +} + +void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget().getInstrInfo(); + + Register BaseReg = FrameRegUpdate + ? FrameReg + : MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); + + int64_t LoopSize = Size; + // If the loop size is not a multiple of 32, split off one 16-byte store at + // the end to fold BaseReg update into. + if (FrameRegUpdate && *FrameRegUpdate) + LoopSize -= LoopSize % 32; + MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGloop_wback + : AArch64::STGloop_wback)) + .addDef(SizeReg) + .addDef(BaseReg) + .addImm(LoopSize) + .addReg(BaseReg) + .setMemRefs(CombinedMemRefs); + if (FrameRegUpdate) + LoopI->setFlags(FrameRegUpdateFlags); + + int64_t ExtraBaseRegUpdate = + FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0; + if (LoopSize < Size) { + assert(FrameRegUpdate); + assert(Size - LoopSize == 16); + // Tag 16 more bytes at BaseReg and update BaseReg. + BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) + .addDef(BaseReg) + .addReg(BaseReg) + .addReg(BaseReg) + .addImm(1 + ExtraBaseRegUpdate / 16) + .setMemRefs(CombinedMemRefs) + .setMIFlags(FrameRegUpdateFlags); + } else if (ExtraBaseRegUpdate) { + // Update BaseReg. + BuildMI( + *MBB, InsertI, DL, + TII->get(ExtraBaseRegUpdate > 0 ? 
AArch64::ADDXri : AArch64::SUBXri)) + .addDef(BaseReg) + .addReg(BaseReg) + .addImm(std::abs(ExtraBaseRegUpdate)) + .addImm(0) + .setMIFlags(FrameRegUpdateFlags); + } +} + +// Check if *II is a register update that can be merged into STGloop that ends +// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the +// end of the loop. +bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, + int64_t Size, int64_t *TotalOffset) { + MachineInstr &MI = *II; + if ((MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::SUBXri) && + MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { + unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); + int64_t Offset = MI.getOperand(2).getImm() << Shift; + if (MI.getOpcode() == AArch64::SUBXri) + Offset = -Offset; + int64_t AbsPostOffset = std::abs(Offset - Size); + const int64_t kMaxOffset = + 0xFFF; // Max encoding for unshifted ADDXri / SUBXri + if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) { + *TotalOffset = Offset; + return true; + } + } + return false; +} + +void mergeMemRefs(const SmallVectorImpl &TSE, + SmallVectorImpl &MemRefs) { + MemRefs.clear(); + for (auto &TS : TSE) { + MachineInstr *MI = TS.MI; + // An instruction without memory operands may access anything. Be + // conservative and return an empty list. + if (MI->memoperands_empty()) { + MemRefs.clear(); + return; + } + MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); + } +} + +void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast) { + if (TagStores.empty()) + return; + TagStoreInstr &FirstTagStore = TagStores[0]; + TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; + Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; + DL = TagStores[0].MI->getDebugLoc(); + + Register Reg; + FrameRegOffset = TFI->resolveFrameOffsetReference( + *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, + /*PreferFP=*/false, /*ForSimm=*/true); + FrameReg = Reg; + FrameRegUpdate = None; + + mergeMemRefs(TagStores, CombinedMemRefs); + + LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; + for (const auto &Instr + : TagStores) { dbgs() << " " << *Instr.MI; }); + + // Size threshold where a loop becomes shorter than a linear sequence of + // tagging instructions. + const int kSetTagLoopThreshold = 176; + if (Size < kSetTagLoopThreshold) { + if (TagStores.size() < 2) + return; + emitUnrolled(InsertI); + } else { + MachineInstr *UpdateInstr = nullptr; + int64_t TotalOffset; + if (IsLast) { + // See if we can merge base register update into the STGloop. + // This is done in AArch64LoadStoreOptimizer for "normal" stores, + // but STGloop is way too unusual for that, and also it only + // realistically happens in function epilogue. Also, STGloop is expanded + // before that pass. 
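canMergeRegUpdate above only folds a trailing ADDXri/SUBXri on the loop's base register when the leftover adjustment after the tagged region fits an unshifted 12-bit immediate and stays a multiple of 16. The same acceptance test as a stand-alone sketch in plain C++; canFoldBaseRegUpdate is an illustrative name.

#include <cassert>
#include <cstdint>

// Can an SP update of UpdateOffset bytes be folded into a tag-store loop that
// already advances the base register by TaggedSize bytes?  Mirrors the range
// and alignment checks in canMergeRegUpdate.
static bool canFoldBaseRegUpdate(int64_t UpdateOffset, int64_t TaggedSize) {
  int64_t Residual = UpdateOffset - TaggedSize;
  if (Residual < 0)
    Residual = -Residual;
  const int64_t kMaxOffset = 0xFFF; // unshifted ADDXri / SUBXri immediate
  return Residual <= kMaxOffset && Residual % 16 == 0;
}

int main() {
  assert(canFoldBaseRegUpdate(512, 480));   // residual 32: foldable
  assert(!canFoldBaseRegUpdate(520, 480));  // residual 40: not 16-aligned
  assert(!canFoldBaseRegUpdate(8192, 0));   // residual 8192: out of range
  return 0;
}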
+ if (InsertI != MBB->end() && + canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, + &TotalOffset)) { + UpdateInstr = &*InsertI++; + LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " + << *UpdateInstr); + } + } + + if (!UpdateInstr && TagStores.size() < 2) + return; + + if (UpdateInstr) { + FrameRegUpdate = TotalOffset; + FrameRegUpdateFlags = UpdateInstr->getFlags(); + } + emitLoop(InsertI); + if (UpdateInstr) + UpdateInstr->eraseFromParent(); + } + + for (auto &TS : TagStores) + TS.MI->eraseFromParent(); +} + +bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, + int64_t &Size, bool &ZeroData) { + MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = MI.getOpcode(); + ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || + Opcode == AArch64::STZ2GOffset); + + if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { + if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) + return false; + if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) + return false; + Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); + Size = MI.getOperand(2).getImm(); + return true; + } + + if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) + Size = 16; + else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) + Size = 32; + else + return false; + + if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) + return false; + + Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + + 16 * MI.getOperand(2).getImm(); + return true; +} + +// Detect a run of memory tagging instructions for adjacent stack frame slots, +// and replace them with a shorter instruction sequence: +// * replace STG + STG with ST2G +// * replace STGloop + STGloop with STGloop +// This code needs to run when stack slot offsets are already known, but before +// FrameIndex operands in STG instructions are eliminated. +MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, + const AArch64FrameLowering *TFI, + RegScavenger *RS) { + bool FirstZeroData; + int64_t Size, Offset; + MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator NextI = ++II; + if (&MI == &MBB->instr_back()) + return II; + if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) + return II; + + SmallVector Instrs; + Instrs.emplace_back(&MI, Offset, Size); + + constexpr int kScanLimit = 10; + int Count = 0; + for (MachineBasicBlock::iterator E = MBB->end(); + NextI != E && Count < kScanLimit; ++NextI) { + MachineInstr &MI = *NextI; + bool ZeroData; + int64_t Size, Offset; + // Collect instructions that update memory tags with a FrameIndex operand + // and (when applicable) constant size, and whose output registers are dead + // (the latter is almost always the case in practice). Since these + // instructions effectively have no inputs or outputs, we are free to skip + // any non-aliasing instructions in between without tracking used registers. + if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { + if (ZeroData != FirstZeroData) + break; + Instrs.emplace_back(&MI, Offset, Size); + continue; + } + + // Only count non-transient, non-tagging instructions toward the scan + // limit. + if (!MI.isTransient()) + ++Count; + + // Just in case, stop before the epilogue code starts. 
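After the scan loop, tryMergeAdjacentSTG sorts the collected tag stores by offset, gives up if any overlap, and hands each contiguous run to a separate TagStoreEdit::emitCode call. A reduced stand-alone model of that grouping step in plain C++; Store and groupRuns are illustrative names.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Store { int64_t Offset, Size; }; // illustrative stand-in

// Sort by offset, reject overlaps, then split into contiguous runs; each run
// corresponds to one TagStoreEdit::emitCode invocation.
static void groupRuns(std::vector<Store> Stores) {
  if (Stores.empty())
    return;
  std::stable_sort(Stores.begin(), Stores.end(),
                   [](const Store &L, const Store &R) { return L.Offset < R.Offset; });
  int64_t Cur = Stores.front().Offset;
  for (const Store &S : Stores) {
    if (Cur > S.Offset) { std::puts("overlap: keep original code"); return; }
    Cur = S.Offset + S.Size;
  }
  int64_t End = Stores.front().Offset;
  unsigned Run = 0;
  for (const Store &S : Stores) {
    if (S.Offset != End)
      ++Run; // a gap starts a new run
    std::printf("run %u: [%lld, %lld)\n", Run, (long long)S.Offset,
                (long long)(S.Offset + S.Size));
    End = S.Offset + S.Size;
  }
}

int main() {
  groupRuns({{32, 16}, {0, 32}, {64, 16}}); // two runs: [0,48) and [64,80)
  return 0;
}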
+ if (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) + break; + + // Reject anything that may alias the collected instructions. + if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) + break; + } + + // New code will be inserted after the last tagging instruction we've found. + MachineBasicBlock::iterator InsertI = Instrs.back().MI; + InsertI++; + + llvm::stable_sort(Instrs, + [](const TagStoreInstr &Left, const TagStoreInstr &Right) { + return Left.Offset < Right.Offset; + }); + + // Make sure that we don't have any overlapping stores. + int64_t CurOffset = Instrs[0].Offset; + for (auto &Instr : Instrs) { + if (CurOffset > Instr.Offset) + return NextI; + CurOffset = Instr.Offset + Instr.Size; + } + + // Find contiguous runs of tagged memory and emit shorter instruction + // sequencies for them when possible. + TagStoreEdit TSE(MBB, FirstZeroData); + Optional EndOffset; + for (auto &Instr : Instrs) { + if (EndOffset && *EndOffset != Instr.Offset) { + // Found a gap. + TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.clear(); + } + + TSE.addInstruction(Instr); + EndOffset = Instr.Offset + Instr.Size; + } + + TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + + return InsertI; +} +} // namespace + +void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS = nullptr) const { + if (StackTaggingMergeSetTag) + for (auto &BB : MF) + for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) + II = tryMergeAdjacentSTG(II, this, RS); +} + +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP +/// before the update. This is easily retrieved as it is exactly the offset +/// that is set in processFunctionBeforeFrameFinalized. int AArch64FrameLowering::getFrameIndexReferencePreferSP( - const MachineFunction &MF, int FI, unsigned &FrameReg, + const MachineFunction &MF, int FI, Register &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " - << MFI.getObjectOffset(FI) << "\n"); - FrameReg = AArch64::SP; - return MFI.getObjectOffset(FI); + if (IgnoreSPUpdates) { + LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " + << MFI.getObjectOffset(FI) << "\n"); + FrameReg = AArch64::SP; + return MFI.getObjectOffset(FI); + } + + return getFrameIndexReference(MF, FI, FrameReg); } /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve @@ -2678,5 +3140,5 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( MF.getInfo()->getCalleeSavedStackSize(); // This is the amount of stack a funclet needs to allocate. 
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), - getStackAlignment()); + getStackAlign()); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index b5719feb6b154..9d0a6d9eaf255 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,8 +24,9 @@ public: : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; + void + emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -39,23 +40,24 @@ public: bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, bool PreferFP, + Register &FrameReg, bool PreferFP, bool ForSimm) const; StackOffset resolveFrameOffsetReference(const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, - bool isSVE, unsigned &FrameReg, + bool isSVE, Register &FrameReg, bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, + ArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector &CSI, - const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef CSI, + const TargetRegisterInfo *TRI) const override; /// Can this function use the red zone for local allocations. 
bool canUseRedZone(const MachineFunction &MF) const; @@ -77,12 +79,16 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + void + processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, + RegScavenger *RS) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - unsigned &FrameReg, + Register &FrameReg, bool IgnoreSPUpdates) const override; int getNonLocalFrameIndexReference(const MachineFunction &MF, int FI) const override; @@ -107,6 +113,8 @@ private: int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; + bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index a51aa85a931c0..10c4778533533 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -62,6 +62,9 @@ public: unsigned ConstraintID, std::vector &OutOps) override; + template + bool SelectRDVLImm(SDValue N, SDValue &Imm); + bool tryMLAV64LaneV128(SDNode *N); bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); @@ -159,6 +162,24 @@ public: return false; } + bool SelectDupZero(SDValue N) { + switch(N->getOpcode()) { + case AArch64ISD::DUP: + case ISD::SPLAT_VECTOR: { + auto Opnd0 = N->getOperand(0); + if (auto CN = dyn_cast(Opnd0)) + if (CN->isNullValue()) + return true; + if (auto CN = dyn_cast(Opnd0)) + if (CN->isZero()) + return true; + break; + } + } + + return false; + } + template bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) { return SelectSVEAddSubImm(N, VT, Imm, Shift); @@ -169,6 +190,11 @@ public: return SelectSVELogicalImm(N, VT, Imm); } + template + bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) { + return SelectSVEShiftImm64(N, Low, High, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. template bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -197,6 +223,9 @@ public: /// unchanged; otherwise a REG_SEQUENCE value is returned. SDValue createDTuple(ArrayRef Vecs); SDValue createQTuple(ArrayRef Vecs); + // Form a sequence of SVE registers for instructions using list of vectors, + // e.g. structured loads and stores (ldN, stN). + SDValue createZTuple(ArrayRef Vecs); /// Generic helper for the createDTuple/createQTuple /// functions. Those should almost always be called instead. @@ -216,11 +245,31 @@ public: unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + + bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); + /// SVE Reg+Imm addressing mode. + template + bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base, + SDValue &OffImm); + /// SVE Reg+Reg address mode. 
+ template + bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) { + return SelectSVERegRegAddrMode(N, Scale, Base, Offset); + } void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + template + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, + const unsigned Opc_ri); + template + std::tuple + findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, + const unsigned Opc_ri, const SDValue &OldBase, + const SDValue &OldOffset); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -268,13 +317,19 @@ private: bool SelectCMP_SWAP(SDNode *N); + bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift); + bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); + bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High, + SDValue &Imm); bool SelectSVEArithImm(SDValue N, SDValue &Imm); + bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, + SDValue &Offset); }; } // end anonymous namespace @@ -679,6 +734,23 @@ static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { return SDValue(Node, 0); } +// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. +template +bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) { + if (!isa(N)) + return false; + + int64_t MulImm = cast(N)->getSExtValue(); + if ((MulImm % std::abs(Scale)) == 0) { + int64_t RDVLImm = MulImm / Scale; + if ((RDVLImm >= Low) && (RDVLImm <= High)) { + Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32); + return true; + } + } + + return false; +} /// SelectArithExtendedRegister - Select a "extended register" operand. This /// operand folds in an extend followed by an optional left shift. @@ -832,16 +904,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, if (!GAN) return true; - if (GAN->getOffset() % Size == 0) { - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getValueType(); - if (Alignment == 0 && Ty->isSized()) - Alignment = DL.getABITypeAlignment(Ty); - - if (Alignment >= Size) - return true; - } + if (GAN->getOffset() % Size == 0 && + GAN->getGlobal()->getPointerAlignment(DL) >= Size) + return true; } if (CurDAG->isBaseWithConstantOffset(N)) { @@ -1132,6 +1197,16 @@ SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { return createTuple(Regs, RegClassIDs, SubRegs); } +SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef Regs) { + static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID, + AArch64::ZPR3RegClassID, + AArch64::ZPR4RegClassID}; + static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2, AArch64::zsub3}; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, const unsigned RegClassIDs[], const unsigned SubRegs[]) { @@ -1240,6 +1315,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { } } else if (VT == MVT::f16) { Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; + } else if (VT == MVT::bf16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ? 
AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1334,6 +1411,54 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +/// Optimize \param OldBase and \param OldOffset selecting the best addressing +/// mode. Returns a tuple consisting of an Opcode, an SDValue representing the +/// new Base and an SDValue representing the new offset. +template +std::tuple +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, + const unsigned Opc_ri, + const SDValue &OldBase, + const SDValue &OldOffset) { + SDValue NewBase = OldBase; + SDValue NewOffset = OldOffset; + // Detect a possible Reg+Imm addressing mode. + const bool IsRegImm = SelectAddrModeIndexedSVE( + N, OldBase, NewBase, NewOffset); + + // Detect a possible reg+reg addressing mode, but only if we haven't already + // detected a Reg+Imm one. + const bool IsRegReg = + !IsRegImm && SelectSVERegRegAddrMode(OldBase, NewBase, NewOffset); + + // Select the instruction. + return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset); +} + +void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, + const unsigned Opc) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SDValue Ops[] = {N->getOperand(1), // Predicate + N->getOperand(2), // Memory operand + CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops); + SDValue SuperReg = SDValue(Load, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); @@ -1354,6 +1479,49 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } +template +void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, + const unsigned Opc_rr, + const unsigned Opc_ri) { + SDLoc dl(N); + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = createZTuple(Regs); + + // Optimize addressing mode. 
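findAddrModeSVELoadStore above tries the reg+imm form first and only falls back to reg+reg when no in-range immediate is found; the chosen flavour picks between the _ri and _rr opcodes. A tiny stand-alone illustration of that preference in plain C++; pickFlavour is an illustrative name.

#include <cassert>

enum class Flavour { RegImm, RegReg };

// Which opcode flavour does findAddrModeSVELoadStore pick?  Reg+imm wins
// whenever it matches; reg+reg is only a fallback.
static Flavour pickFlavour(bool RegImmMatches, bool RegRegMatches) {
  if (RegImmMatches)
    return Flavour::RegImm;              // Opc_ri, offset is an immediate
  return RegRegMatches ? Flavour::RegReg // Opc_rr, offset is a register
                       : Flavour::RegImm; // default Opc_ri with #0 offset
}

int main() {
  assert(pickFlavour(true, true) == Flavour::RegImm);
  assert(pickFlavour(false, true) == Flavour::RegReg);
  assert(pickFlavour(false, false) == Flavour::RegImm);
  return 0;
}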
+ unsigned Opc; + SDValue Offset, Base; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), + CurDAG->getTargetConstant(0, dl, MVT::i64)); + + SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate + Base, // address + Offset, // offset + N->getOperand(0)}; // chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + ReplaceNode(N, St); +} + +bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + + // Try to match it for the frame address + if (auto FINode = dyn_cast(N)) { + int FI = FINode->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + return false; +} + void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); @@ -2632,7 +2800,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { // bits that are implicitly ANDed off by the above opcodes and if so, skip // the AND. uint64_t MaskImm; - if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm)) + if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) && + !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm)) return false; if (countTrailingOnes(MaskImm) < Bits) @@ -2879,6 +3048,32 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { return true; } +bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base, + SDValue &Offset) { + auto C = dyn_cast(N); + if (!C) + return false; + + auto Ty = N->getValueType(0); + + int64_t Imm = C->getSExtValue(); + SDLoc DL(N); + + if ((Imm >= -128) && (Imm <= 127)) { + Base = CurDAG->getTargetConstant(Imm, DL, Ty); + Offset = CurDAG->getTargetConstant(0, DL, Ty); + return true; + } + + if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) { + Base = CurDAG->getTargetConstant(Imm/256, DL, Ty); + Offset = CurDAG->getTargetConstant(8, DL, Ty); + return true; + } + + return false; +} + bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) { if (auto CNode = dyn_cast(N)) { const int64_t ImmVal = CNode->getZExtValue(); @@ -2917,7 +3112,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { if (auto CNode = dyn_cast(N)) { int64_t ImmVal = CNode->getSExtValue(); SDLoc DL(N); - if (ImmVal >= -127 && ImmVal < 127) { + if (ImmVal >= -128 && ImmVal < 128) { Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); return true; } @@ -2975,6 +3170,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) { return false; } +// This method is only needed to "cast" i64s into i32s when the value +// is a valid shift which has been splatted into a vector with i64 elements. +// Every other type is fine in tablegen. 
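SelectSVE8BitLslImm above accepts either a plain signed 8-bit immediate with shift 0, or any multiple of 256 in [-32768, 32512], which it encodes as value/256 with LSL #8. The same split as a stand-alone check in plain C++; encode8BitLslImm is an illustrative name.

#include <cassert>
#include <cstdint>

// Returns true and fills Encoded/Shift when Imm fits the "8-bit, optionally
// shifted left by 8" immediate form matched above.
static bool encode8BitLslImm(int64_t Imm, int64_t &Encoded, unsigned &Shift) {
  if (Imm >= -128 && Imm <= 127) {
    Encoded = Imm;
    Shift = 0;
    return true;
  }
  if (Imm % 256 == 0 && Imm >= -32768 && Imm <= 32512) {
    Encoded = Imm / 256;
    Shift = 8;
    return true;
  }
  return false;
}

int main() {
  int64_t E; unsigned S;
  assert(encode8BitLslImm(100, E, S) && E == 100 && S == 0);
  assert(encode8BitLslImm(-1024, E, S) && E == -4 && S == 8);
  assert(!encode8BitLslImm(300, E, S)); // neither 8-bit nor a multiple of 256
  return 0;
}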
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low, + uint64_t High, SDValue &Imm) { + if (auto *CN = dyn_cast(N)) { + uint64_t ImmVal = CN->getZExtValue(); + SDLoc DL(N); + + if (ImmVal >= Low && ImmVal <= High) { + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + } + + return false; +} + bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { // tagp(FrameIndex, IRGstack, tag_offset): // since the offset between FrameIndex and IRGstack is a compile-time @@ -3027,6 +3240,63 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) { ReplaceNode(N, N3); } +// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length +// vector types larger than NEON don't have a matching SubRegIndex. +static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) { + assert(V.getValueType().isScalableVector() && + V.getValueType().getSizeInBits().getKnownMinSize() == + AArch64::SVEBitsPerBlock && + "Expected to extract from a packed scalable vector!"); + assert(VT.isFixedLengthVector() && + "Expected to extract a fixed length vector!"); + + SDLoc DL(V); + switch (VT.getSizeInBits()) { + case 64: { + auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg); + } + case 128: { + auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg); + } + default: { + auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64); + return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); + } + } +} + +// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length +// vector types larger than NEON don't have a matching SubRegIndex. +static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) { + assert(VT.isScalableVector() && + VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock && + "Expected to insert into a packed scalable vector!"); + assert(V.getValueType().isFixedLengthVector() && + "Expected to insert a fixed length vector!"); + + SDLoc DL(V); + switch (V.getValueType().getSizeInBits()) { + case 64: { + auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, + SDValue(Container, 0), V, SubReg); + } + case 128: { + auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, + SDValue(Container, 0), V, SubReg); + } + default: { + auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64); + return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); + } + } +} + void AArch64DAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -3100,6 +3370,52 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break; + case ISD::EXTRACT_SUBVECTOR: { + // Bail when not a "cast" like extract_subvector. + if (cast(Node->getOperand(1))->getZExtValue() != 0) + break; + + // Bail when normal isel can do the job. + EVT InVT = Node->getOperand(0).getValueType(); + if (VT.isScalableVector() || InVT.isFixedLengthVector()) + break; + + // NOTE: We can only get here when doing fixed length SVE code generation. 
+ // We do manual selection because the types involved are not linked to real + // registers (despite being legal) and must be coerced into SVE registers. + // + // NOTE: If the above changes, be aware that selection will still not work + // because the td definition of extract_vector does not support extracting + // a fixed length vector from a scalable vector. + + ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0))); + return; + } + + case ISD::INSERT_SUBVECTOR: { + // Bail when not a "cast" like insert_subvector. + if (cast(Node->getOperand(2))->getZExtValue() != 0) + break; + if (!Node->getOperand(0).isUndef()) + break; + + // Bail when normal isel should do the job. + EVT InVT = Node->getOperand(1).getValueType(); + if (VT.isFixedLengthVector() || InVT.isScalableVector()) + break; + + // NOTE: We can only get here when doing fixed length SVE code generation. + // We do manual selection because the types involved are not linked to real + // registers (despite being legal) and must be coerced into SVE registers. + // + // NOTE: If the above changes, be aware that selection will still not work + // because the td definition of insert_vector does not support inserting a + // fixed length vector into a scalable vector. + + ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1))); + return; + } + case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. @@ -3185,10 +3501,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3212,10 +3528,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3239,10 +3555,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3266,10 +3582,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == 
MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3293,10 +3609,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3320,10 +3636,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3347,10 +3663,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3374,10 +3690,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3401,10 +3717,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3426,7 +3742,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 2, AArch64::LD2i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT 
== MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 2, AArch64::LD2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3444,7 +3760,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 3, AArch64::LD3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 3, AArch64::LD3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3462,7 +3778,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectLoadLane(Node, 4, AArch64::LD4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 4, AArch64::LD4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3537,10 +3853,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST1Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST1Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST1Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3565,10 +3883,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 3, AArch64::ST1Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST1Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST1Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3593,10 +3913,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST1Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST1Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST1Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3621,10 +3943,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST2Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST2Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST2Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3649,10 +3973,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 3, AArch64::ST3Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT 
== MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST3Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST3Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3677,10 +4003,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST4Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST4Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST4Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3703,7 +4031,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 2, AArch64::ST2i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 2, AArch64::ST2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3722,7 +4050,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 3, AArch64::ST3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 3, AArch64::ST3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3741,7 +4069,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectStoreLane(Node, 4, AArch64::ST4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 4, AArch64::ST4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3755,6 +4083,69 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } break; } + case Intrinsic::aarch64_sve_st2: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore(Node, 2, AArch64::ST2B, + AArch64::ST2B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore(Node, 2, AArch64::ST2H, + AArch64::ST2H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore(Node, 2, AArch64::ST2W, + AArch64::ST2W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore(Node, 2, AArch64::ST2D, + AArch64::ST2D_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_st3: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore(Node, 3, AArch64::ST3B, + AArch64::ST3B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore(Node, 3, AArch64::ST3H, + AArch64::ST3H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore(Node, 3, AArch64::ST3W, + AArch64::ST3W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore(Node, 3, AArch64::ST3D, + AArch64::ST3D_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_st4: { + if (VT == MVT::nxv16i8) { + SelectPredicatedStore(Node, 4, AArch64::ST4B, + AArch64::ST4B_IMM); + return; + } else 
if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedStore(Node, 4, AArch64::ST4H, + AArch64::ST4H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedStore(Node, 4, AArch64::ST4W, + AArch64::ST4W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedStore(Node, 4, AArch64::ST4D, + AArch64::ST4D_IMM); + return; + } + break; + } } break; } @@ -3765,10 +4156,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3793,10 +4184,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3821,10 +4212,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3849,10 +4240,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3877,10 +4268,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, 
AArch64::LD1Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3905,10 +4296,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3933,10 +4324,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3961,10 +4352,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3989,10 +4380,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4017,10 +4408,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4043,7 +4434,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 
1, AArch64::LD1i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4062,7 +4453,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4081,7 +4472,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4100,7 +4491,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4122,10 +4513,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4151,10 +4542,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4180,10 +4571,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4209,10 +4600,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == 
MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4238,10 +4629,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 ) { SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4267,10 +4658,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4294,7 +4685,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4314,7 +4705,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4334,7 +4725,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4348,6 +4739,57 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } break; } + case AArch64ISD::SVE_LD2_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + return; + } + break; + } + case AArch64ISD::SVE_LD3_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + return; + } else if 
(VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + return; + } + break; + } + case AArch64ISD::SVE_LD4_MERGE_ZERO: { + if (VT == MVT::nxv16i8) { + SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + return; + } + break; + } } // Select the default instruction @@ -4360,3 +4802,130 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new AArch64DAGToDAGISel(TM, OptLevel); } + +/// When \p PredVT is a scalable vector predicate in the form +/// MVT::nxxi1, it builds the correspondent scalable vector of +/// integers MVT::nxxi s.t. M x bits = 128. If the input +/// PredVT is not in the form MVT::nxxi1, it returns an invalid +/// EVT. +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { + if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) + return EVT(); + + if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 && + PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1) + return EVT(); + + ElementCount EC = PredVT.getVectorElementCount(); + EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + return MemVT; +} + +/// Return the EVT of the data associated to a memory operation in \p +/// Root. If such EVT cannot be retrived, it returns an invalid EVT. +static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { + if (isa(Root)) + return cast(Root)->getMemoryVT(); + + if (isa(Root)) + return cast(Root)->getMemoryVT(); + + const unsigned Opcode = Root->getOpcode(); + // For custom ISD nodes, we have to look at them individually to extract the + // type of the data moved to/from memory. + switch (Opcode) { + case AArch64ISD::LD1_MERGE_ZERO: + case AArch64ISD::LD1S_MERGE_ZERO: + case AArch64ISD::LDNF1_MERGE_ZERO: + case AArch64ISD::LDNF1S_MERGE_ZERO: + return cast(Root->getOperand(3))->getVT(); + case AArch64ISD::ST1_PRED: + return cast(Root->getOperand(4))->getVT(); + default: + break; + } + + if (Opcode != ISD::INTRINSIC_VOID) + return EVT(); + + const unsigned IntNo = + cast(Root->getOperand(1))->getZExtValue(); + if (IntNo != Intrinsic::aarch64_sve_prf) + return EVT(); + + // We are using an SVE prefetch intrinsic. Type must be inferred + // from the width of the predicate. + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(2)->getValueType(0)); +} + +/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: +/// Base + OffImm * sizeof(MemVT) for Min >= OffImm <= Max +/// where Root is the memory access using N for its address. 
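The matching rule implemented by SelectAddrModeIndexedSVE, whose body follows, can be restated with plain integers: the address must decompose as Base + vscale * MulImm, MulImm must be an exact multiple of the accessed vector's minimum byte width, and the resulting element offset must lie in [Min, Max]. A standalone sketch under those assumptions, with hypothetical names:

#include <cstdint>
#include <cstdio>

// True if MulImm (the vscale multiplier found in the address) corresponds to a
// whole number of vectors and that element offset is encodable in [Min, Max].
bool matchSVEIndexedOffset(int64_t MulImm, int64_t MemWidthBytes, int64_t Min,
                           int64_t Max, int64_t &OffImm) {
  if (MulImm % MemWidthBytes != 0)
    return false;
  OffImm = MulImm / MemWidthBytes;
  return OffImm >= Min && OffImm <= Max;
}

int main() {
  int64_t Off = 0;
  // One packed SVE vector has a minimum width of 16 bytes, so an address of
  // Base + vscale * 32 is the reg+imm form with offset #2. The bounds here are
  // sample values; the real Min/Max come from the instruction's immediate field.
  if (matchSVEIndexedOffset(/*MulImm=*/32, /*MemWidthBytes=*/16, /*Min=*/-8,
                            /*Max=*/7, Off))
    std::printf("reg+imm form: #%lld, mul vl\n", (long long)Off);
  return 0;
}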
+template +bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, + SDValue &Base, + SDValue &OffImm) { + const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); + + if (MemVT == EVT()) + return false; + + if (N.getOpcode() != ISD::ADD) + return false; + + SDValue VScale = N.getOperand(1); + if (VScale.getOpcode() != ISD::VSCALE) + return false; + + TypeSize TS = MemVT.getSizeInBits(); + int64_t MemWidthBytes = static_cast(TS.getKnownMinSize()) / 8; + int64_t MulImm = cast(VScale.getOperand(0))->getSExtValue(); + + if ((MulImm % MemWidthBytes) != 0) + return false; + + int64_t Offset = MulImm / MemWidthBytes; + if (Offset < Min || Offset > Max) + return false; + + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); + return true; +} + +/// Select register plus register addressing mode for SVE, with scaled +/// offset. +bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale, + SDValue &Base, + SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) + return false; + + // Process an ADD node. + const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + // 8 bit data does not come with the SHL node, so it is treated + // separately. + if (Scale == 0) { + Base = LHS; + Offset = RHS; + return true; + } + + // Check if the RHS is a shift node with a constant. + if (RHS.getOpcode() != ISD::SHL) + return false; + + const SDValue ShiftRHS = RHS.getOperand(1); + if (auto *C = dyn_cast(ShiftRHS)) + if (C->getZExtValue() == Scale) { + Base = LHS; + Offset = RHS.getOperand(0); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d45a80057564a..85db14ab66feb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); -static cl::opt -EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, - cl::desc("Allow AArch64 SLI/SRI formation"), - cl::init(false)); - // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. @@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; +/// Returns true if VT's elements occupy the lowest bit positions of its +/// associated register class without any intervening space. +/// +/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the +/// same register class, but only nxv8f16 can be treated as a packed vector. 
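isPackedVectorType below treats a legal vector type as packed when it is either fixed length or exactly one 128-bit SVE block at vscale = 1; getPackedVectorTypeFromPredicateType in the ISelDAGToDAG hunk above relies on the same block size, deriving the data element width as SVEBitsPerBlock divided by the predicate's element count. A standalone check of that arithmetic:

#include <cstdio>

constexpr unsigned SVEBitsPerBlock = 128;

// Element width of the packed data vector that corresponds to a predicate with
// the given number of i1 elements per 128-bit block.
constexpr unsigned packedElementBits(unsigned PredElementCount) {
  return SVEBitsPerBlock / PredElementCount;
}

int main() {
  static_assert(packedElementBits(16) == 8, "nxv16i1 pairs with nxv16i8");
  static_assert(packedElementBits(8) == 16, "nxv8i1 pairs with nxv8i16");
  static_assert(packedElementBits(4) == 32, "nxv4i1 pairs with nxv4i32");
  static_assert(packedElementBits(2) == 64, "nxv2i1 pairs with nxv2i64");
  std::puts("predicate-to-packed-vector widths check out");
  return 0;
}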
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { + assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal vector type!"); + return VT.isFixedLengthVector() || + VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); @@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); + addDRTypeForNEON(MVT::v4bf16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); @@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); + addQRTypeForNEON(MVT::v8bf16); } if (Subtarget->hasSVE()) { @@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + if (Subtarget->hasBF16()) { + addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); + } + + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + } + for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); } for (auto VT : { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + + for (auto VT : + { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, + MVT::nxv2f64 }) { + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } } // Compute derived properties from the register classes @@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const 
TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); @@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); @@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Expand); } + // AArch64 doesn't have i32 MULH{S|U}. + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + // AArch64 doesn't have {U|S}MUL_LOHI. 
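Marking the 32-bit MULHS/MULHU nodes Expand, as the lines just above do, leaves the generic legalizer to recover the high half from a wider multiply; roughly the following arithmetic (standalone sketch, hypothetical function names):

#include <cstdint>
#include <cstdio>

// High 32 bits of an unsigned 32x32 multiply, via a 64-bit multiply.
uint32_t mulhu32(uint32_t A, uint32_t B) {
  return static_cast<uint32_t>((static_cast<uint64_t>(A) * B) >> 32);
}

// High 32 bits of a signed 32x32 multiply, via a 64-bit multiply.
int32_t mulhs32(int32_t A, int32_t B) {
  return static_cast<int32_t>((static_cast<int64_t>(A) * B) >> 32);
}

int main() {
  std::printf("%u\n", mulhu32(0x80000000u, 4)); // prints 2
  std::printf("%d\n", mulhs32(-2, 0x40000000)); // prints -1
  return 0;
}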
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); + setOperationAction(ISD::CTPOP, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); @@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the + // custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. + setOperationAction(ISD::STORE, MVT::v32i8, Custom); + setOperationAction(ISD::STORE, MVT::v16i16, Custom); + setOperationAction(ISD::STORE, MVT::v16f16, Custom); + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v8f32, Custom); + setOperationAction(ISD::STORE, MVT::v4f64, Custom); + setOperationAction(ISD::STORE, MVT::v4i64, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; @@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); + setIndexedLoadAction(im, MVT::bf16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); @@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); + setIndexedStoreAction(im, MVT::bf16, Legal); } // Trap. @@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); + + setOperationAction(ISD::TRUNCATE, VT, Custom); } for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { @@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + if (Subtarget->hasSVE()) + setOperationAction(ISD::VSCALE, MVT::i32, Custom); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } @@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. 
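CTPOP on MVT::i128 is also marked Custom in a hunk above; whatever instruction sequence the backend ends up emitting, splitting the value is sound because a 128-bit population count is simply the sum of the counts of the two 64-bit halves, as in this standalone sketch:

#include <bitset>
#include <cstdint>
#include <cstdio>

// Population count of a 128-bit value represented as two 64-bit halves.
unsigned popcount128(uint64_t Hi, uint64_t Lo) {
  return static_cast<unsigned>(std::bitset<64>(Hi).count() +
                               std::bitset<64>(Lo).count());
}

int main() {
  std::printf("%u\n", popcount128(~0ull, 1)); // 64 set bits + 1 = 65
  return 0;
}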
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - if (isTypeLegal(VT)) + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + if (VT.getScalarType() == MVT::i1) + setOperationAction(ISD::SETCC, VT, Custom); + } } + + for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + + for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } + + // NOTE: Currently this has to happen after computeRegisterProperties rather + // than the preferred option of combining it with the addRegisterClass call. + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + + // 64bit results can mean a bigger than NEON input. + for (auto VT : {MVT::v8i8, MVT::v4i16}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); + + // 128bit results imply a bigger than NEON input. + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + for (auto VT : {MVT::v8f16, MVT::v4f32}) + setOperationAction(ISD::FP_ROUND, VT, Expand); + } } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); @@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { } } +void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + // Lower fixed length vector operations to scalable equivalents. 
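The registration strategy of addTypeForFixedLengthSVE is deliberately conservative: every operation on the newly added fixed-length type starts out Expand, and only the short list that follows is given an SVE-based custom lowering. The same pattern in miniature, with a hypothetical enum standing in for the ISD opcodes:

#include <array>
#include <cstdio>
#include <initializer_list>

enum Action { ExpandOp, CustomOp };
enum Op { OpAdd, OpFAdd, OpLoad, OpStore, OpTruncate, OpMul, NumOps };

int main() {
  std::array<Action, NumOps> Actions;
  Actions.fill(ExpandOp); // by default every operation on the type expands
  for (Op O : {OpAdd, OpFAdd, OpLoad, OpStore, OpTruncate})
    Actions[O] = CustomOp; // whitelisted: lowered to scalable SVE equivalents
  std::printf("MUL still expands: %d\n", Actions[OpMul] == ExpandOp);
  return 0;
}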
+ setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); +} + void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); @@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, - EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, + LLVMContext &C, EVT VT) const { if (!VT.isVector()) return MVT::i32; + if (VT.isScalableVector()) + return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, } bool AArch64TargetLowering::targetShrinkDemandedConstant( - SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + TargetLoweringOpt &TLO) const { // Delay this optimization to as late as possible. if (!TLO.LegalOps) return false; @@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( "i32 or i64 is expected after legalization."); // Exit early if we demand all bits. - if (Demanded.countPopulation() == Size) + if (DemandedBits.countPopulation() == Size) return false; unsigned NewOpc; @@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( if (!C) return false; uint64_t Imm = C->getZExtValue(); - return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); + return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in @@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Same as above but handling LLTs instead. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( - LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; @@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. - Align <= 2 || + Alignment <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. 
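The next hunk rewrites getTargetNodeName so that the hand-maintained case/return name table becomes a list of MAKE_CASE invocations; the macro uses the preprocessor's stringizing operator (#V), so each enumerator supplies both its case label and its printable name. A minimal standalone reproduction with a hypothetical enum:

#include <cstdio>

enum NodeType { CALL, ADRP, RET_FLAG };

#define MAKE_CASE(V) \
  case V: \
    return #V;

const char *nodeName(NodeType Opcode) {
  switch (Opcode) {
    MAKE_CASE(CALL)
    MAKE_CASE(ADRP)
    MAKE_CASE(RET_FLAG)
  }
  return "UNKNOWN";
}

int main() {
  std::printf("%s\n", nodeName(ADRP)); // prints "ADRP"
  return 0;
}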
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { +#define MAKE_CASE(V) \ + case V: \ + return #V; switch ((AArch64ISD::NodeType)Opcode) { - case AArch64ISD::FIRST_NUMBER: break; - case AArch64ISD::CALL: return "AArch64ISD::CALL"; - case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; - case AArch64ISD::ADR: return "AArch64ISD::ADR"; - case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; - case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; - case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; - case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; - case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; - case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; - case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; - case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; - case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; - case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; - case AArch64ISD::ADC: return "AArch64ISD::ADC"; - case AArch64ISD::SBC: return "AArch64ISD::SBC"; - case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; - case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; - case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; - case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; - case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; - case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; - case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; - case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; - case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::DUP: return "AArch64ISD::DUP"; - case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; - case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; - case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; - case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; - case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; - case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; - case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; - case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; - case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; - case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; - case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; - case AArch64ISD::BICi: return "AArch64ISD::BICi"; - case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; - case AArch64ISD::BSL: return "AArch64ISD::BSL"; - case AArch64ISD::NEG: return "AArch64ISD::NEG"; - case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; - case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; - case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; - case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; - case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; - case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; - case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; - case AArch64ISD::REV16: return "AArch64ISD::REV16"; - case AArch64ISD::REV32: return "AArch64ISD::REV32"; - case AArch64ISD::REV64: return "AArch64ISD::REV64"; - case AArch64ISD::EXT: return "AArch64ISD::EXT"; - case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; - case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; - case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; - case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; - case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; - case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; - case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; - case AArch64ISD::CMHS: return 
"AArch64ISD::CMHS"; - case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; - case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; - case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; - case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; - case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; - case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; - case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; - case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; - case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; - case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; - case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; - case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; - case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; - case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; - case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; - case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; - case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; - case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; - case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; - case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED"; - case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED"; - case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED"; - case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED"; - case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED"; - case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED"; - case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED"; - case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N"; - case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N"; - case AArch64ISD::LASTA: return "AArch64ISD::LASTA"; - case AArch64ISD::LASTB: return "AArch64ISD::LASTB"; - case AArch64ISD::REV: return "AArch64ISD::REV"; - case AArch64ISD::TBL: return "AArch64ISD::TBL"; - case AArch64ISD::NOT: return "AArch64ISD::NOT"; - case AArch64ISD::BIT: return "AArch64ISD::BIT"; - case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; - case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; - case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; - case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; - case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; - case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; - case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; - case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; - case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; - case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; - case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; - case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; - case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; - case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; - case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; - case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; - case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; - case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; - case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; - case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; - case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; - case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; - case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; - case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; - case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; - case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; - case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; - case AArch64ISD::LD1DUPpost: return 
"AArch64ISD::LD1DUPpost"; - case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; - case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; - case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; - case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; - case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; - case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; - case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; - case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; - case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; - case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; - case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; - case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; - case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; - case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; - case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; - case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; - case AArch64ISD::STG: return "AArch64ISD::STG"; - case AArch64ISD::STZG: return "AArch64ISD::STZG"; - case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; - case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; - case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; - case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; - case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; - case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; - case AArch64ISD::INSR: return "AArch64ISD::INSR"; - case AArch64ISD::PTEST: return "AArch64ISD::PTEST"; - case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE"; - case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; - case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; - case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; - case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW"; - case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED"; - case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED"; - case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM"; - case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S"; - case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED"; - case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW"; - case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW"; - case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; - case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; - case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; - case AArch64ISD::SST1: return "AArch64ISD::SST1"; - case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; - case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; - case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; - case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; - case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; - case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; - case AArch64ISD::LDP: return "AArch64ISD::LDP"; - case AArch64ISD::STP: return "AArch64ISD::STP"; - } + case AArch64ISD::FIRST_NUMBER: + break; + MAKE_CASE(AArch64ISD::CALL) + MAKE_CASE(AArch64ISD::ADRP) + MAKE_CASE(AArch64ISD::ADR) + MAKE_CASE(AArch64ISD::ADDlow) + MAKE_CASE(AArch64ISD::LOADgot) + MAKE_CASE(AArch64ISD::RET_FLAG) + MAKE_CASE(AArch64ISD::BRCOND) + MAKE_CASE(AArch64ISD::CSEL) + MAKE_CASE(AArch64ISD::FCSEL) + MAKE_CASE(AArch64ISD::CSINV) + MAKE_CASE(AArch64ISD::CSNEG) + MAKE_CASE(AArch64ISD::CSINC) + MAKE_CASE(AArch64ISD::THREAD_POINTER) + 
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::ADD_PRED) + MAKE_CASE(AArch64ISD::SDIV_PRED) + MAKE_CASE(AArch64ISD::UDIV_PRED) + MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::SHL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRA_MERGE_OP1) + MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ADC) + MAKE_CASE(AArch64ISD::SBC) + MAKE_CASE(AArch64ISD::ADDS) + MAKE_CASE(AArch64ISD::SUBS) + MAKE_CASE(AArch64ISD::ADCS) + MAKE_CASE(AArch64ISD::SBCS) + MAKE_CASE(AArch64ISD::ANDS) + MAKE_CASE(AArch64ISD::CCMP) + MAKE_CASE(AArch64ISD::CCMN) + MAKE_CASE(AArch64ISD::FCCMP) + MAKE_CASE(AArch64ISD::FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::DUP) + MAKE_CASE(AArch64ISD::DUPLANE8) + MAKE_CASE(AArch64ISD::DUPLANE16) + MAKE_CASE(AArch64ISD::DUPLANE32) + MAKE_CASE(AArch64ISD::DUPLANE64) + MAKE_CASE(AArch64ISD::MOVI) + MAKE_CASE(AArch64ISD::MOVIshift) + MAKE_CASE(AArch64ISD::MOVIedit) + MAKE_CASE(AArch64ISD::MOVImsl) + MAKE_CASE(AArch64ISD::FMOV) + MAKE_CASE(AArch64ISD::MVNIshift) + MAKE_CASE(AArch64ISD::MVNImsl) + MAKE_CASE(AArch64ISD::BICi) + MAKE_CASE(AArch64ISD::ORRi) + MAKE_CASE(AArch64ISD::BSP) + MAKE_CASE(AArch64ISD::NEG) + MAKE_CASE(AArch64ISD::EXTR) + MAKE_CASE(AArch64ISD::ZIP1) + MAKE_CASE(AArch64ISD::ZIP2) + MAKE_CASE(AArch64ISD::UZP1) + MAKE_CASE(AArch64ISD::UZP2) + MAKE_CASE(AArch64ISD::TRN1) + MAKE_CASE(AArch64ISD::TRN2) + MAKE_CASE(AArch64ISD::REV16) + MAKE_CASE(AArch64ISD::REV32) + MAKE_CASE(AArch64ISD::REV64) + MAKE_CASE(AArch64ISD::EXT) + MAKE_CASE(AArch64ISD::VSHL) + MAKE_CASE(AArch64ISD::VLSHR) + MAKE_CASE(AArch64ISD::VASHR) + MAKE_CASE(AArch64ISD::VSLI) + MAKE_CASE(AArch64ISD::VSRI) + MAKE_CASE(AArch64ISD::CMEQ) + MAKE_CASE(AArch64ISD::CMGE) + MAKE_CASE(AArch64ISD::CMGT) + MAKE_CASE(AArch64ISD::CMHI) + MAKE_CASE(AArch64ISD::CMHS) + MAKE_CASE(AArch64ISD::FCMEQ) + MAKE_CASE(AArch64ISD::FCMGE) + MAKE_CASE(AArch64ISD::FCMGT) + MAKE_CASE(AArch64ISD::CMEQz) + MAKE_CASE(AArch64ISD::CMGEz) + MAKE_CASE(AArch64ISD::CMGTz) + MAKE_CASE(AArch64ISD::CMLEz) + MAKE_CASE(AArch64ISD::CMLTz) + MAKE_CASE(AArch64ISD::FCMEQz) + MAKE_CASE(AArch64ISD::FCMGEz) + MAKE_CASE(AArch64ISD::FCMGTz) + MAKE_CASE(AArch64ISD::FCMLEz) + MAKE_CASE(AArch64ISD::FCMLTz) + MAKE_CASE(AArch64ISD::SADDV) + MAKE_CASE(AArch64ISD::UADDV) + MAKE_CASE(AArch64ISD::SRHADD) + MAKE_CASE(AArch64ISD::URHADD) + MAKE_CASE(AArch64ISD::SMINV) + MAKE_CASE(AArch64ISD::UMINV) + MAKE_CASE(AArch64ISD::SMAXV) + MAKE_CASE(AArch64ISD::UMAXV) + MAKE_CASE(AArch64ISD::SMAXV_PRED) + MAKE_CASE(AArch64ISD::UMAXV_PRED) + MAKE_CASE(AArch64ISD::SMINV_PRED) + MAKE_CASE(AArch64ISD::UMINV_PRED) + MAKE_CASE(AArch64ISD::ORV_PRED) + MAKE_CASE(AArch64ISD::EORV_PRED) + MAKE_CASE(AArch64ISD::ANDV_PRED) + MAKE_CASE(AArch64ISD::CLASTA_N) + MAKE_CASE(AArch64ISD::CLASTB_N) + MAKE_CASE(AArch64ISD::LASTA) + MAKE_CASE(AArch64ISD::LASTB) + MAKE_CASE(AArch64ISD::REV) + MAKE_CASE(AArch64ISD::REINTERPRET_CAST) + MAKE_CASE(AArch64ISD::TBL) + MAKE_CASE(AArch64ISD::FADD_PRED) + MAKE_CASE(AArch64ISD::FADDA_PRED) + MAKE_CASE(AArch64ISD::FADDV_PRED) + MAKE_CASE(AArch64ISD::FMA_PRED) + MAKE_CASE(AArch64ISD::FMAXV_PRED) + MAKE_CASE(AArch64ISD::FMAXNMV_PRED) + MAKE_CASE(AArch64ISD::FMINV_PRED) + MAKE_CASE(AArch64ISD::FMINNMV_PRED) + MAKE_CASE(AArch64ISD::NOT) + MAKE_CASE(AArch64ISD::BIT) + MAKE_CASE(AArch64ISD::CBZ) + 
MAKE_CASE(AArch64ISD::CBNZ) + MAKE_CASE(AArch64ISD::TBZ) + MAKE_CASE(AArch64ISD::TBNZ) + MAKE_CASE(AArch64ISD::TC_RETURN) + MAKE_CASE(AArch64ISD::PREFETCH) + MAKE_CASE(AArch64ISD::SITOF) + MAKE_CASE(AArch64ISD::UITOF) + MAKE_CASE(AArch64ISD::NVCAST) + MAKE_CASE(AArch64ISD::SQSHL_I) + MAKE_CASE(AArch64ISD::UQSHL_I) + MAKE_CASE(AArch64ISD::SRSHR_I) + MAKE_CASE(AArch64ISD::URSHR_I) + MAKE_CASE(AArch64ISD::SQSHLU_I) + MAKE_CASE(AArch64ISD::WrapperLarge) + MAKE_CASE(AArch64ISD::LD2post) + MAKE_CASE(AArch64ISD::LD3post) + MAKE_CASE(AArch64ISD::LD4post) + MAKE_CASE(AArch64ISD::ST2post) + MAKE_CASE(AArch64ISD::ST3post) + MAKE_CASE(AArch64ISD::ST4post) + MAKE_CASE(AArch64ISD::LD1x2post) + MAKE_CASE(AArch64ISD::LD1x3post) + MAKE_CASE(AArch64ISD::LD1x4post) + MAKE_CASE(AArch64ISD::ST1x2post) + MAKE_CASE(AArch64ISD::ST1x3post) + MAKE_CASE(AArch64ISD::ST1x4post) + MAKE_CASE(AArch64ISD::LD1DUPpost) + MAKE_CASE(AArch64ISD::LD2DUPpost) + MAKE_CASE(AArch64ISD::LD3DUPpost) + MAKE_CASE(AArch64ISD::LD4DUPpost) + MAKE_CASE(AArch64ISD::LD1LANEpost) + MAKE_CASE(AArch64ISD::LD2LANEpost) + MAKE_CASE(AArch64ISD::LD3LANEpost) + MAKE_CASE(AArch64ISD::LD4LANEpost) + MAKE_CASE(AArch64ISD::ST2LANEpost) + MAKE_CASE(AArch64ISD::ST3LANEpost) + MAKE_CASE(AArch64ISD::ST4LANEpost) + MAKE_CASE(AArch64ISD::SMULL) + MAKE_CASE(AArch64ISD::UMULL) + MAKE_CASE(AArch64ISD::FRECPE) + MAKE_CASE(AArch64ISD::FRECPS) + MAKE_CASE(AArch64ISD::FRSQRTE) + MAKE_CASE(AArch64ISD::FRSQRTS) + MAKE_CASE(AArch64ISD::STG) + MAKE_CASE(AArch64ISD::STZG) + MAKE_CASE(AArch64ISD::ST2G) + MAKE_CASE(AArch64ISD::STZ2G) + MAKE_CASE(AArch64ISD::SUNPKHI) + MAKE_CASE(AArch64ISD::SUNPKLO) + MAKE_CASE(AArch64ISD::UUNPKHI) + MAKE_CASE(AArch64ISD::UUNPKLO) + MAKE_CASE(AArch64ISD::INSR) + MAKE_CASE(AArch64ISD::PTEST) + MAKE_CASE(AArch64ISD::PTRUE) + MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) + 
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ST1_PRED) + MAKE_CASE(AArch64ISD::SST1_PRED) + MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_IMM_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) + MAKE_CASE(AArch64ISD::LDP) + MAKE_CASE(AArch64ISD::STP) + MAKE_CASE(AArch64ISD::STNP) + MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::INDEX_VECTOR) + } +#undef MAKE_CASE return nullptr; } @@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } -MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( - MachineInstr &MI, MachineBasicBlock *BB) const { - MI.eraseFromParent(); - return BB; -} - MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); - case AArch64::CATCHPAD: - return EmitLoweredCatchPad(MI, BB); } } @@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) { (CC == ISD::SETEQ || CC == ISD::SETNE); } +static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, + SelectionDAG &DAG, SDValue Chain, + bool IsSignaling) { + EVT VT = LHS.getValueType(); + assert(VT != MVT::f128); + assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); + unsigned Opcode = + IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; + return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); +} + static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); @@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; LHS = LHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = AArch64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); + } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { + if (LHS.getOpcode() == ISD::AND) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. 
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, + DAG.getVTList(VT, MVT_CC), + LHS.getOperand(0), + LHS.getOperand(1)); + // Replace all users of (and X, Y) with newly generated (ands X, Y) + DAG.ReplaceAllUsesWith(LHS, ANDSNode); + return ANDSNode.getValue(1); + } else if (LHS.getOpcode() == AArch64ISD::ANDS) { + // Use result of ANDS + return LHS.getValue(1); + } } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) @@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector Ops(Op->op_begin(), Op->op_end()); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; -} - -// Returns true if the given Op is the overflow flag result of an overflow -// intrinsic operation. -static bool isOverflowIntrOpRes(SDValue Op) { - unsigned Opc = Op.getOpcode(); - return (Op.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, + CallOptions, dl, Chain); + return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. - if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { + if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); @@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = SrcVal.getValueType(); + + if (SrcVT != MVT::f128) { + // Expand cases where the input is a vector bigger than NEON. + if (useSVEForFixedLengthVectorVT(SrcVT)) + return SDValue(); + // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. - SDValue SrcVal = Op.getOperand(0); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, - SDLoc(Op)).first; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, + CallOptions, dl, Chain); + return IsStrict ? 
DAG.getMergeValues({Result, Chain}, dl) : Result; } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + + if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. - if (Op.getOperand(0).getValueType() == MVT::f16 && - !Subtarget->hasFullFP16()) { + if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), - DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); } - if (Op.getOperand(0).getValueType() != MVT::f128) { + if (SrcVal.getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); - SmallVector Ops(Op->op_begin(), Op->op_end()); - MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; + return LowerF128Call(Op, DAG, LC); } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, - DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), + DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. 
- if (Op.getOperand(0).getValueType() == MVT::i128) + if (SrcVal.getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based @@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return Op; RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP) + LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } @@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() != MVT::f16) + EVT OpVT = Op.getValueType(); + if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); @@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( - DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } @@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, - MVT::i64)); + SDValue Chain = Op.getOperand(0); + SDValue FPCR_64 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, + {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); + Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); + SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); + return DAG.getMergeValues({AND, Chain}, dl); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { @@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, + int Pattern) { + return DAG.getNode(AArch64ISD::PTRUE, DL, VT, + DAG.getTargetConstant(Pattern, DL, MVT::i32)); +} + SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_ptrue: return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_dupq_lane: + return LowerDUPQLane(Op, DAG); + case Intrinsic::aarch64_sve_convert_from_svbool: + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_convert_to_svbool: { + EVT OutVT = 
Op.getValueType(); + EVT InVT = Op.getOperand(1).getValueType(); + // Return the operand if the cast isn't changing type, + // i.e. -> + if (InVT == OutVT) + return Op.getOperand(1); + // Otherwise, zero the newly introduced lanes. + SDValue Reinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1)); + SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all); + SDValue MaskReinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask); + return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret); + } case Intrinsic::aarch64_sve_insr: { SDValue Scalar = Op.getOperand(2); @@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } + + case Intrinsic::aarch64_neon_vsri: + case Intrinsic::aarch64_neon_vsli: { + EVT Ty = Op.getValueType(); + + if (!Ty.isVector()) + report_fatal_error("Unexpected type for aarch64_neon_vsli"); + + assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); + + bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; + unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3)); + } + + case Intrinsic::aarch64_neon_srhadd: + case Intrinsic::aarch64_neon_urhadd: { + bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd; + unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } } } @@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT MemVT = StoreNode->getMemoryVT(); if (VT.isVector()) { + if (useSVEForFixedLengthVectorVT(VT)) + return LowerFixedLengthVectorStoreToSVE(Op, DAG); + unsigned AS = StoreNode->getAddressSpace(); - unsigned Align = StoreNode->getAlignment(); - if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, + Align Alignment = StoreNode->getAlign(); + if (Alignment < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), StoreNode->getMemOperand()->getFlags(), nullptr)) { return scalarizeVectorStore(StoreNode, DAG); @@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, if (StoreNode->isTruncatingStore()) { return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); } + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of + // the custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. 
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && + MemVT.getVectorElementCount().Min % 2u == 0 && + ((MemVT.getScalarSizeInBits() == 8u || + MemVT.getScalarSizeInBits() == 16u || + MemVT.getScalarSizeInBits() == 32u || + MemVT.getScalarSizeInBits() == 64u))) { + SDValue Lo = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), + {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + StoreNode->getMemoryVT(), StoreNode->getMemOperand()); + return Result; + } } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { assert(StoreNode->getValue()->getValueType(0) == MVT::i128); SDValue Lo = @@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); @@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FMA: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::INSERT_SUBVECTOR: + return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::SDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED); + case ISD::UDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED); + case ISD::SMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1); + case ISD::UMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1); + case ISD::SMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1); + case ISD::UMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1); case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -3218,7 +3571,66 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::VSCALE: + return LowerVSCALE(Op, DAG); + case ISD::TRUNCATE: + return 
LowerTRUNCATE(Op, DAG); + case ISD::LOAD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerFixedLengthVectorLoadToSVE(Op, DAG); + llvm_unreachable("Unexpected request to lower ISD::LOAD"); + case ISD::ADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); + llvm_unreachable("Unexpected request to lower ISD::ADD"); + } +} + +bool AArch64TargetLowering::useSVEForFixedLengthVectors() const { + // Prefer NEON unless larger SVE registers are available. + return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256; +} + +bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const { + if (!useSVEForFixedLengthVectors()) + return false; + + if (!VT.isFixedLengthVector()) + return false; + + // Fixed length predicates should be promoted to i8. + // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. + if (VT.getVectorElementType() == MVT::i1) + return false; + + // Don't use SVE for vectors we cannot scalarize if required. + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f16: + case MVT::f32: + case MVT::f64: + break; } + + // Ensure NEON MVTs only belong to a single register class. + if (VT.getSizeInBits() <= 128) + return false; + + // Don't use SVE for types that don't fit. + if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) + return false; + + // TODO: Perhaps an artificial restriction, but worth having whilst getting + // the base fixed length SVE support in place. + if (!VT.isPow2VectorType()) + return false; + + return true; } //===----------------------------------------------------------------------===// @@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { default: report_fatal_error("Unsupported calling convention."); - case CallingConv::AArch64_SVE_VectorCall: - // Calling SVE functions is currently not yet supported. - report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: @@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::CFGuard_Check: return CC_AArch64_Win64_CFGuard_Check; case CallingConv::AArch64_VectorCall: + case CallingConv::AArch64_SVE_VectorCall: return CC_AArch64_AAPCS; } } @@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; - else if (RegVT == MVT::f16) + else if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; @@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; @@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; + unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect + ? 
VA.getLocVT().getSizeInBits() + : VA.getValVT().getSizeInBits()) / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && @@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MemVT = VA.getLocVT(); + break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MemVT); } + + if (VA.getLocInfo() == CCValAssign::Indirect) { + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + // If value is passed via pointer - do a load. + ArgValue = + DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo()); + } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), ArgValue, DAG.getValueType(MVT::i32)); @@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, // The extra size here, if triggered, will always be 8. MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { - FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); + FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); @@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + // When using the Windows calling convention on a non-windows OS, we want + // to back up and restore X18 in such functions; we can't do a tail call + // from those functions. + if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && + CalleeCC != CallingConv::Win64) + return false; + // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. @@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( const AArch64FunctionInfo *FuncInfo = MF.getInfo(); + // If any of the arguments is passed indirectly, it must be SVE, so the + // 'getBytesInStackArgArea' is not sufficient to determine whether we need to + // allocate space on the stack. That is why we determine this explicitly here + // the call cannot be a tailcall. + if (llvm::any_of(ArgLocs, [](CCValAssign &A) { + assert((A.getLocInfo() != CCValAssign::Indirect || + A.getValVT().isScalableVector()) && + "Expected value to be scalable"); + return A.getLocInfo() == CCValAssign::Indirect; + })) + return false; + // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) @@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) { + if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); @@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); + Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); + int FI = MFI.CreateStackObject( + VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false); + MFI.setStackID(FI, TargetStackID::SVEVector); + + SDValue SpillSlot = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getStore( + Chain, DL, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + Arg = SpillSlot; + break; } if (VA.isRegLoc()) { @@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); } } else { @@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getValVT().getSizeInBits(); + unsigned OpSize; + if (VA.getLocInfo() == CCValAssign::Indirect) + OpSize = VA.getLocVT().getSizeInBits(); + else + OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { @@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + Chain, DL, DstAddr, Arg, SizeNode, + Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, - /*isTailCall = */ false, - DstInfo, MachinePointerInfo()); + /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. 
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { - return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), N->getOffset(), Flag); } @@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. - if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && + if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) @@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Cmp); } - assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || - LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || + LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. @@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; + } else if (VT == MVT::i128) { + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || @@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Chain; + if (IsStrict) + Chain = Op.getOperand(0); + SDValue LHS = Op.getOperand(OpNo + 0); + SDValue RHS = Op.getOperand(OpNo + 1); + ISD::CondCode CC = cast(Op.getOperand(OpNo + 2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. @@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, + IsSignaling); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); - return LHS; + return IsStrict ? 
DAG.getMergeValues({LHS, Chain}, dl) : LHS; } } @@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } // Now we know we're dealing with FP values. @@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + SDValue Cmp; + if (IsStrict) + Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); + else + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue Res; if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); @@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in @@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); - return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } + return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, @@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue FVal = Op->getOperand(2); SDLoc DL(Op); + EVT Ty = Op.getValueType(); + if (Ty.isScalableVector()) { + SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); + MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); + SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); + return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); + } + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. - if (isOverflowIntrOpRes(CCVal)) { + if (ISD::isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. 
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); @@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, - false, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); + DAG.getConstant(VaListSize, DL, MVT::i32), + Align(PtrSize), false, false, false, + MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); + MaybeAlign Align(Op.getConstantOperandVal(3)); unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); @@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain = VAList.getValue(1); VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > MinSlotSize) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Align - 1, DL, PtrVT)); + DAG.getConstant(Align->value() - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-(int64_t)Align, DL, PtrVT)); + DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || - VT.getVectorElementType() == MVT::f16) + VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); @@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; - else if (EltTy == MVT::i16 || EltTy == MVT::f16) + else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; @@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; - if (EltType == MVT::i16 || EltType == MVT::f16) + if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; @@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, // Extend input splat value where needed to fit into a GPR (32b or 64b only) // FPRs don't have this restriction. 
switch (ElemVT.getSimpleVT().SimpleTy) { - case MVT::i8: - case MVT::i16: - case MVT::i32: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); - case MVT::i64: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); case MVT::i1: { + // The only legal i1 vectors are SVE vectors, so we can use SVE-specific + // lowering code. + if (auto *ConstVal = dyn_cast(SplatVal)) { + if (ConstVal->isOne()) + return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); + // TODO: Add special case for constant false + } // The general case of i1. There isn't any natural way to do this, // so we use some trickery with whilelo. - // TODO: Add special cases for splat of constant true/false. SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, DAG.getValueType(MVT::i1)); @@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, DAG.getConstant(0, dl, MVT::i64), SplatVal); } - // TODO: we can support float types, but haven't added patterns yet. + case MVT::i8: + case MVT::i16: + case MVT::i32: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; case MVT::f16: + case MVT::bf16: case MVT::f32: case MVT::f64: + // Fine as is + break; default: report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + +SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + if (!isTypeLegal(VT) || !VT.isScalableVector()) + return SDValue(); + + // Current lowering only supports the SVE-ACLE types. + if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) + return SDValue(); + + // The DUPQ operation is indepedent of element type so normalise to i64s. + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); + SDValue Idx128 = Op.getOperand(2); + + // DUPQ can be used when idx is in range. + auto *CIdx = dyn_cast(Idx128); + if (CIdx && (CIdx->getZExtValue() <= 3)) { + SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); + SDNode *DUPQ = + DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); + return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); + } + + // The ACLE says this must produce the same result as: + // svtbl(data, svadd_x(svptrue_b64(), + // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), + // index * 2)) + SDValue One = DAG.getConstant(1, DL, MVT::i64); + SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); + + // create the vector 0,1,0,1,... + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR, + DL, MVT::nxv2i64, Zero, One); + SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); + + // create the vector idx64,idx64+1,idx64,idx64+1,... + SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); + SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); + SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); + + // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... 
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); + return DAG.getNode(ISD::BITCAST, DL, VT, TBL); } + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) { // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. +// BUILD_VECTORs with constant element C1, C2 is a constant, and: +// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) +// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) +// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) + SDValue And; + SDValue Shift; + + SDValue FirstOp = N->getOperand(0); + unsigned FirstOpc = FirstOp.getOpcode(); + SDValue SecondOp = N->getOperand(1); + unsigned SecondOpc = SecondOp.getOpcode(); + + // Is one of the operands an AND or a BICi? The AND may have been optimised to + // a BICi in order to use an immediate instead of a register. + // Is the other operand an shl or lshr? This will have been turned into: + // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. + if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && + (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { + And = FirstOp; + Shift = SecondOp; + + } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && + (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { + And = SecondOp; + Shift = FirstOp; + } else return SDValue(); - // Is the second op an shl or lshr? - SDValue Shift = N->getOperand(1); - // This will have been turned into: AArch64ISD::VSHL vector, #shift - // or AArch64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + bool IsAnd = And.getOpcode() == ISD::AND; + bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); - // Is the and mask vector all constant? uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); + if (IsAnd) { + // Is the and mask vector all constant? + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + } else { + // Reconstruct the corresponding AND immediate from the two BICi immediates. + ConstantSDNode *C1nodeImm = dyn_cast(And.getOperand(1)); + ConstantSDNode *C1nodeShift = dyn_cast(And.getOperand(2)); + assert(C1nodeImm && C1nodeShift); + C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); + } - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? + // Is C1 == ~(Ones(ElemSizeInBits) << C2) or + // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account + // how much one can shift elements of a particular size? 
uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) + + APInt C1AsAPInt(ElemSizeInBits, C1); + APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) + : APInt::getLowBitsSet(ElemSizeInBits, C2); + if (C1AsAPInt != RequiredC1) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); - unsigned Intrin = - IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, DL, MVT::i32), X, Y, - Shift.getOperand(1)); + unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); @@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableAArch64SlrGeneration) { - if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) - return Res; - } + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) + return Res; EVT VT = Op.getValueType(); @@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); - assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && - "Unsupported floating-point vector type"); + assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || + EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); @@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform insertion by expanding the value @@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform extraction by expanding the value @@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... 
- if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); + assert(Op.getValueType().isFixedLengthVector() && + "Only cases that extract a fixed length vector are supported!"); + EVT InVT = Op.getOperand(0).getValueType(); + unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); + if (InVT.isScalableVector()) { + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG)) + return Op; + + return SDValue(); + } + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. - if (Val == 0) + if (Idx == 0 && InVT.getSizeInBits() <= 128) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64) + return Op; + + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + "Only expect to lower inserts into scalable vectors!"); + + EVT InVT = Op.getOperand(1).getValueType(); + unsigned Idx = cast(Op.getOperand(2))->getZExtValue(); + + // We don't have any patterns for scalable vector yet. + if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT)) + return SDValue(); + + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { + // Currently no fixed length shuffles that require SVE are legal. + if (useSVEForFixedLengthVectorVT(VT)) + return false; + if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; @@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } +// Attempt to form urhadd(OpA, OpB) from +// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)). +// The original form of this expression is +// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function +// is called the srl will have been lowered to AArch64ISD::VLSHR and the +// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)). +// This pass can also recognize a variant of this pattern that uses sign +// extension instead of zero extension and form a srhadd(OpA, OpB) from it. +SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (!VT.isVector() || VT.isScalableVector()) + return Op; + + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + return LowerFixedLengthVectorTruncateToSVE(Op, DAG); + + // Since we are looking for a right shift by a constant value of 1 and we are + // operating on types at least 16 bits in length (sign/zero extended OpA and + // OpB, which are at least 8 bits), it follows that the truncate will always + // discard the shifted-in bit and therefore the right shift will be logical + // regardless of the signedness of OpA and OpB. + SDValue Shift = Op.getOperand(0); + if (Shift.getOpcode() != AArch64ISD::VLSHR) + return Op; + + // Is the right shift using an immediate value of 1? 
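[Editorial aside, not part of the imported patch: a small standalone check, in plain C++, of the rewrite the LowerTRUNCATE comment above depends on. For unsigned 8-bit elements it exhaustively verifies that the DAG-combined form truncate((zext(OpB) - (zext(OpA) ^ Ones)) >> 1) equals the rounding-halving add (OpA + OpB + 1) >> 1 that URHADD computes; the variable names are local to the sketch.]

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint16_t ExtA = static_cast<uint16_t>(a);  // zext OpA to i16
      uint16_t ExtB = static_cast<uint16_t>(b);  // zext OpB to i16
      // OpB - (OpA ^ Ones(16)), i.e. the sub/xor form matched above.
      uint16_t Sub = static_cast<uint16_t>(ExtB - (ExtA ^ 0xFFFFu));
      // Logical shift right by 1, then truncate back to i8.
      uint8_t Combined = static_cast<uint8_t>(Sub >> 1);
      // urhadd semantics: (OpA + OpB + 1) >> 1.
      uint8_t Expected = static_cast<uint8_t>((a + b + 1) >> 1);
      assert(Combined == Expected);
    }
  return 0;
}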
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1); + if (ShiftAmount != 1) + return Op; + + SDValue Sub = Shift->getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return Op; + + SDValue Xor = Sub.getOperand(1); + if (Xor.getOpcode() != ISD::XOR) + return Op; + + SDValue ExtendOpA = Xor.getOperand(0); + SDValue ExtendOpB = Sub.getOperand(0); + unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); + unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); + if (!(ExtendOpAOpc == ExtendOpBOpc && + (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + return Op; + + // Is the result of the right shift being truncated to the same value type as + // the original operands, OpA and OpB? + SDValue OpA = ExtendOpA.getOperand(0); + SDValue OpB = ExtendOpB.getOperand(0); + EVT OpAVT = OpA.getValueType(); + assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); + if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + return Op; + + // Is the XOR using a constant amount of all ones in the right hand side? + uint64_t C; + if (!isAllConstantBuildVector(Xor.getOperand(1), C)) + return Op; + + unsigned ElemSizeInBits = VT.getScalarSizeInBits(); + APInt CAsAPInt(ElemSizeInBits, C); + if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) + return Op; + + SDLoc DL(Op); + bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; + unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB); + + return ResultURHADD; +} + SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, llvm_unreachable("unexpected shift opcode"); case ISD::SHL: + if (VT.isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1); + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); @@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: + if (VT.isScalableVector()) { + unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_MERGE_OP1 + : AArch64ISD::SRL_MERGE_OP1; + return LowerToPredicatedOp(Op, DAG, Opc); + } + // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = @@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) { + if (Op.getOperand(0).getValueType().isFloatingPoint()) + return Op; + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); + } + ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( @@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); @@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), @@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, dl); } +SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT != MVT::i64 && "Expected illegal VSCALE node"); + + SDLoc DL(Op); + APInt MulImm = cast(Op.getOperand(0))->getAPIntValue(); + return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), + DL, VT); +} + +/// Set the IntrinsicInfo for the `aarch64_sve_st` intrinsics. +template +static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, + const CallInst &CI) { + Info.opc = ISD::INTRINSIC_VOID; + // Retrieve EC from first vector argument. + const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType()); + ElementCount EC = VT.getVectorElementCount(); +#ifndef NDEBUG + // Check the assumption that all input vectors are the same type. + for (unsigned I = 0; I < NumVecs; ++I) + assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) && + "Invalid type."); +#endif + // memVT is `NumVecs * VT`. + Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), + EC * NumVecs); + Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1); + Info.offset = 0; + Info.align.reset(); + Info.flags = MachineMemOperand::MOStore; + return true; +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { + case Intrinsic::aarch64_sve_st2: + return setInfoSVEStN<2>(Info, I); + case Intrinsic::aarch64_sve_st3: + return setInfoSVEStN<3>(Info, I); + case Intrinsic::aarch64_sve_st4: + return setInfoSVEStN<4>(Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_sve_ldnt1: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOLoad; + if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { PointerType *PtrTy = cast(I.getArgOperand(2)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOStore; + if (Intrinsic == Intrinsic::aarch64_sve_stnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } default: @@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { /// or upper half of the vector elements. 
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { auto areTypesHalfed = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast(FullV->getType()); - auto *HalfVT = cast(HalfV->getType()); - return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); + auto *FullTy = FullV->getType(); + auto *HalfTy = HalfV->getType(); + return FullTy->getPrimitiveSizeInBits().getFixedSize() == + 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); }; auto extractHalf = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast(FullV->getType()); - auto *HalfVT = cast(HalfV->getType()); + auto *FullVT = cast(FullV->getType()); + auto *HalfVT = cast(HalfV->getType()); return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); }; - Constant *M1, *M2; + ArrayRef M1, M2; Value *S1Op1, *S2Op1; - if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || - !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) + if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || + !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) return false; // Check that the operands are half as wide as the result and we extract @@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { // elements. int M1Start = -1; int M2Start = -1; - int NumElements = cast(Op1->getType())->getNumElements() * 2; + int NumElements = cast(Op1->getType())->getNumElements() * 2; if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) @@ -8948,8 +9682,24 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { return true; } -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. +/// Check if Op could be used with vmull_high_p64 intrinsic. +static bool isOperandOfVmullHighP64(Value *Op) { + Value *VectorOperand = nullptr; + ConstantInt *ElementIndex = nullptr; + return match(Op, m_ExtractElt(m_Value(VectorOperand), + m_ConstantInt(ElementIndex))) && + ElementIndex->getValue() == 1 && + isa(VectorOperand->getType()) && + cast(VectorOperand->getType())->getNumElements() == 2; +} + +/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. +static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { + return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
bool AArch64TargetLowering::shouldSinkOperands( Instruction *I, SmallVectorImpl &Ops) const { @@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands( Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; + + case Intrinsic::aarch64_neon_pmull64: + if (!areOperandsOfVmullHighP64(II->getArgOperand(0), + II->getArgOperand(1))) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + Ops.push_back(&II->getArgOperandUse(1)); + return true; + default: return false; } @@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands( } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { + Align &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. - RequiredAligment = 0; + RequiredAligment = Align(1); unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } @@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } MachineMemOperand::Flags -AArch64TargetLowering::getMMOFlags(const Instruction &I) const { +AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; @@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType( unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the number of vector elements is greater than 1. - if (VecTy->getNumElements() < 2) + if (cast(VecTy)->getNumElements() < 2) return false; // Ensure the element type is legal. @@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); - VectorType *VecTy = Shuffles[0]->getType(); + VectorType *VTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL)) return false; - unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + unsigned NumLoads = getNumInterleavedAccesses(VTy, DL); + + auto *FVTy = cast(VTy); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. - Type *EltTy = VecTy->getVectorElementType(); + Type *EltTy = FVTy->getElementType(); if (EltTy->isPointerTy()) - VecTy = - VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + FVTy = + FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); IRBuilder<> Builder(LI); @@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / NumLoads); + FVTy = FixedVectorType::get(FVTy->getElementType(), + FVTy->getNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. 
BaseAddr = Builder.CreateBitCast( - BaseAddr, VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace())); + BaseAddr, + FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } - Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); - Type *Tys[2] = {VecTy, PtrTy}; + Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {FVTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; @@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = - Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, - VecTy->getVectorNumElements() * Factor); + BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr, + FVTy->getNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); @@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, VectorType::get(SVI->getType()->getVectorElementType(), - VecTy->getVectorNumElements())); + SubVec, FixedVectorType::get(SVI->getType()->getElementType(), + FVTy->getNumElements())); SubVecs[SVI].push_back(SubVec); } } @@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - VectorType *VecTy = SVI->getType(); - assert(VecTy->getVectorNumElements() % Factor == 0 && - "Invalid interleaved store"); + auto *VecTy = cast(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned LaneLen = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); - unsigned NumOpElts = Op0->getType()->getVectorNumElements(); + unsigned NumOpElts = + cast(Op0->getType())->getNumElements(); // Convert to the corresponding integer vector. - Type *IntVecTy = VectorType::get(IntTy, NumOpElts); + auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, LaneLen); + SubVecTy = FixedVectorType::get(IntTy, LaneLen); } // The base address of the store. @@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. 
Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( - SI->getPointerAddressSpace())); + BaseAddr, + SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); @@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); } } // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); @@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); +// Lower an SVE structured load intrinsic returning a tuple type to target +// specific intrinsic taking the same input but returning a multi-result value +// of the split tuple type. +// +// E.g. Lowering an LD3: +// +// call @llvm.aarch64.sve.ld3.nxv12i32( +// %pred, +// * %addr) +// +// Output DAG: +// +// t0: ch = EntryToken +// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0 +// t4: i64,ch = CopyFromReg t0, Register:i64 %1 +// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4 +// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2 +// +// This is called pre-legalization to avoid widening/splitting issues with +// non-power-of-2 tuple types used for LD3, such as nxv12i32. 
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, + ArrayRef LoadOps, + EVT VT, SelectionDAG &DAG, + const SDLoc &DL) const { + assert(VT.isScalableVector() && "Can only lower scalable vectors"); + + unsigned N, Opcode; + static std::map> IntrinsicMap = { + {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; + + std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; + assert(VT.getVectorElementCount().Min % N == 0 && + "invalid tuple vector type!"); + + EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorElementCount() / N); + assert(isTypeLegal(SplitVT)); + + SmallVector VTs(N, SplitVT); + VTs.push_back(MVT::Other); // Chain + SDVTList NodeTys = DAG.getVTList(VTs); + + SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps); + SmallVector PseudoLoadOps; + for (unsigned I = 0; I < N; ++I) + PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps); } EVT AArch64TargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType( // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. - bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return MVT::v2i64; - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return MVT::f128; - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return MVT::i64; - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return MVT::i32; return MVT::Other; } LLT AArch64TargetLowering::getOptimalMemOpLLT( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( // Only use AdvSIMD to implement memset of 32-byte and above. 
It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. - bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return LLT::vector(2, 64); - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return LLT::scalar(128); - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return LLT::scalar(64); - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return LLT::scalar(32); return LLT(); } @@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; + // FIXME: Update this method to support scalable addressing modes. + if (isa(Ty)) + return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; + // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; @@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N, } if (FoundMatch) - return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } @@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); + unsigned Opc = Src->getOpcode(); + + // Zero/any extend of an unsigned unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + SDValue UnpkOp = Src->getOperand(0); + SDValue Dup = N->getOperand(1); + + if (Dup.getOpcode() != AArch64ISD::DUP) + return SDValue(); + + SDLoc DL(N); + ConstantSDNode *C = dyn_cast(Dup->getOperand(0)); + uint64_t ExtVal = C->getZExtValue(); + + // If the mask is fully covered by the unpack, we don't need to push + // a new AND onto the operand + EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); + if ((ExtVal == 0xFF && EltTy == MVT::i8) || + (ExtVal == 0xFFFF && EltTy == MVT::i16) || + (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) + return Src; + + // Truncate to prevent a DUP with an over wide constant + APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); + + // Otherwise, make sure we propagate the AND to the operand + // of the unpack + Dup = DAG.getNode(AArch64ISD::DUP, DL, + UnpkOp->getValueType(0), + DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); + + SDValue And = DAG.getNode(ISD::AND, DL, + UnpkOp->getValueType(0), UnpkOp, Dup); + + return DAG.getNode(Opc, DL, N->getValueType(0), And); + } + SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) return SDValue(); - // GLD1* instructions perform an implicit zero-extend, which makes them + EVT MemVT; + + // SVE 
load instructions perform an implicit zero-extend, which makes them // perfect candidates for combining. - switch (Src->getOpcode()) { - case AArch64ISD::GLD1: - case AArch64ISD::GLD1_SCALED: - case AArch64ISD::GLD1_SXTW: - case AArch64ISD::GLD1_SXTW_SCALED: - case AArch64ISD::GLD1_UXTW: - case AArch64ISD::GLD1_UXTW_SCALED: - case AArch64ISD::GLD1_IMM: + switch (Opc) { + case AArch64ISD::LD1_MERGE_ZERO: + case AArch64ISD::LDNF1_MERGE_ZERO: + case AArch64ISD::LDFF1_MERGE_ZERO: + MemVT = cast(Src->getOperand(3))->getVT(); + break; + case AArch64ISD::GLD1_MERGE_ZERO: + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + case AArch64ISD::GLDFF1_MERGE_ZERO: + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + case AArch64ISD::GLDNT1_MERGE_ZERO: + MemVT = cast(Src->getOperand(4))->getVT(); break; default: return SDValue(); } - EVT MemVT = cast(Src->getOperand(4))->getVT(); - if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) return Src; @@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., @@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N, // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. - if (N->getNumOperands() == 2 && - N0->getOpcode() == ISD::TRUNCATE && - N1->getOpcode() == ISD::TRUNCATE) { + if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && + N1Opc == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); @@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + // Optimise concat_vectors of two [us]rhadds that use extracted subvectors + // from the same original vectors. Combine these into a single [us]rhadd that + // operates on the two original vectors. 
Example: + // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), + // extract_subvector (v16i8 OpB, + // <0>))), + // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), + // extract_subvector (v16i8 OpB, + // <8>))))) + // -> + // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) + if (N->getNumOperands() == 2 && N0Opc == N1Opc && + (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) { + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + + EVT N00VT = N00.getValueType(); + EVT N10VT = N10.getValueType(); + + if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { + SDValue N00Source = N00->getOperand(0); + SDValue N01Source = N01->getOperand(0); + SDValue N10Source = N10->getOperand(0); + SDValue N11Source = N11->getOperand(0); + + if (N00Source == N10Source && N01Source == N11Source && + N00Source.getValueType() == VT && N01Source.getValueType() == VT) { + assert(N0.getValueType() == N1.getValueType()); + + uint64_t N00Index = N00.getConstantOperandVal(1); + uint64_t N01Index = N01.getConstantOperandVal(1); + uint64_t N10Index = N10.getConstantOperandVal(1); + uint64_t N11Index = N11.getConstantOperandVal(1); + + if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && + N10Index == N00VT.getVectorNumElements()) + return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); + } + } + } + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. @@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - if (N1->getOpcode() != ISD::BITCAST) + if (N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); @@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, return SDValue(); } +static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op1 = N->getOperand(1); + SDValue Op2 = N->getOperand(2); + EVT ScalarTy = Op1.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) { + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); + Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); + } + + return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0), + Op1, Op2); +} + +static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + SDValue Scalar = N->getOperand(3); + EVT ScalarTy = Scalar.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) + Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); + + SDValue Passthru = N->getOperand(1); + SDValue Pred = N->getOperand(2); + return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), + Pred, Scalar, Passthru); +} + static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); LLVMContext &Ctx = *DAG.getContext(); @@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, EXT); } -static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, - bool Invert, +static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 
TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize()) @@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, } } + if (!Imm) + return SDValue(); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); - SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64); - SDValue Op0, Op1; - if (Invert) { - Op0 = Splat; - Op1 = N->getOperand(2); - } else { - Op0 = N->getOperand(2); - Op1 = Splat; - } - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - ID, Pred, Op0, Op1); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, + N->getOperand(2), Splat, DAG.getCondCode(CC)); } return SDValue(); @@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, return DAG.getZExtOrTrunc(Res, DL, VT); } +static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue VecToReduce = N->getOperand(2); + + EVT ReduceVT = VecToReduce.getValueType(); + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + +static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue InitVal = N->getOperand(2); + SDValue VecToReduce = N->getOperand(3); + EVT ReduceVT = VecToReduce.getValueType(); + + // Ordered reductions use the first lane of the result vector as the + // reduction's initial value. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, + DAG.getUNDEF(ReduceVT), InitVal, Zero); + + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. 
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N, return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG); case Intrinsic::aarch64_sve_andv: return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG); + case Intrinsic::aarch64_sve_index: + return LowerSVEIntrinsicIndex(N, DAG); + case Intrinsic::aarch64_sve_dup: + return LowerSVEIntrinsicDUP(N, DAG); + case Intrinsic::aarch64_sve_dup_x: + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), + N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_smin: + return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umin: + return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_smax: + return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umax: + return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsl: + return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsr: + return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_asr: + return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_cmphs: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); + break; + case Intrinsic::aarch64_sve_cmphi: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); + break; + case Intrinsic::aarch64_sve_cmpge: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGE)); + break; + case Intrinsic::aarch64_sve_cmpgt: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGT)); + break; + case Intrinsic::aarch64_sve_cmpeq: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); + break; + case Intrinsic::aarch64_sve_cmpne: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), 
N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + break; + case Intrinsic::aarch64_sve_fadda: + return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); + case Intrinsic::aarch64_sve_faddv: + return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxnmv: + return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxv: + return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_fminnmv: + return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fminv: + return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); + case Intrinsic::aarch64_sve_sel: + return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmpeq_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); case Intrinsic::aarch64_sve_cmpne_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); case Intrinsic::aarch64_sve_cmpge_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); case Intrinsic::aarch64_sve_cmpgt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); case Intrinsic::aarch64_sve_cmple_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); case Intrinsic::aarch64_sve_cmphs_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); case Intrinsic::aarch64_sve_cmphi_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplo_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); case Intrinsic::aarch64_sve_cmpls_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); case Intrinsic::aarch64_sve_ptest_any: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::ANY_ACTIVE); @@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N, if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) + // If the source VT is a 64-bit fixed or scalable vector, we can play games + // and get the better results we want. 
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + ElementCount SrcEC = SrcVT.getVectorElementCount(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); @@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N, // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); + LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext()); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); + LoVT.getVectorElementCount()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); + DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, return NewST1; } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. +static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + case MVT::nxv8i8: + case MVT::nxv8i16: + case MVT::nxv8f16: + case MVT::nxv8bf16: + return MVT::nxv8i16; + case MVT::nxv16i8: + return MVT::nxv16i8; + } +} + +static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + EVT ContainerVT = VT; + if (ContainerVT.isInteger()) + ContainerVT = getSVEContainerType(ContainerVT); + + SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); + SDValue Ops[] = { N->getOperand(0), // Chain + N->getOperand(2), // Pg + N->getOperand(3), // Base + DAG.getValueType(VT) }; + + SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (ContainerVT.isInteger() && (VT != ContainerVT)) + Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({ Load, LoadChain }, DL); +} + static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); + if (VT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); + EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); @@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { return L; } +template +static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { + 
static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || + Opcode == AArch64ISD::LD1RO_MERGE_ZERO, + "Unsupported opcode."); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + EVT LoadVT = VT; + if (VT.isFloatingPoint()) + LoadVT = VT.changeTypeToInteger(); + + SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; + SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (VT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + +static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Data = N->getOperand(2); + EVT DataVT = Data.getValueType(); + EVT HwSrcVt = getSVEContainerType(DataVT); + SDValue InputVT = DAG.getValueType(DataVT); + + if (DataVT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + if (DataVT.isFloatingPoint()) + InputVT = DAG.getValueType(HwSrcVt); + + SDValue SrcNew; + if (Data.getValueType().isFloatingPoint()) + SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); + else + SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); + + SDValue Ops[] = { N->getOperand(0), // Chain + SrcNew, + N->getOperand(4), // Base + N->getOperand(3), // Pg + InputVT + }; + + return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); +} + static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); @@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); + if (DataVT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); + if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); @@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); + // Avoid scalarizing zero splat stores for scalable vectors. + if (VT.isScalableVector()) + return SDValue(); + // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); @@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); - if (!VT.isVector()) + + if (!VT.isFixedLengthVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of @@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT.isScalableVector()) + return SDValue(); + unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. @@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } -// Returns an SVE type that ContentTy can be trivially sign or zero extended -// into. -static MVT getSVEContainerType(EVT ContentTy) { - assert(ContentTy.isSimple() && "No SVE containers for extended types"); +// Turns the vector of indices into a vector of byte offstes by scaling Offset +// by (BitWidth / 8). 
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, + SDLoc DL, unsigned BitWidth) { + assert(Offset.getValueType().isScalableVector() && + "This method is only for scalable vectors of offsets"); - switch (ContentTy.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("No known SVE container for this MVT type"); - case MVT::nxv2i8: - case MVT::nxv2i16: - case MVT::nxv2i32: - case MVT::nxv2i64: - case MVT::nxv2f32: - case MVT::nxv2f64: - return MVT::nxv2i64; - case MVT::nxv4i8: - case MVT::nxv4i16: - case MVT::nxv4i32: - case MVT::nxv4f32: - return MVT::nxv4i32; - } + SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64); + SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift); + + return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift); } -static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { +/// Check if the value of \p OffsetInBytes can be used as an immediate for +/// the gather load/prefetch and scatter store instructions with vector base and +/// immediate offset addressing mode: +/// +/// [.[S|D]{, #}] +/// +/// where = sizeof() * k, for k = 0, 1, ..., 31. + +inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, + unsigned ScalarSizeInBytes) { + // The immediate is not a multiple of the scalar size. + if (OffsetInBytes % ScalarSizeInBytes) + return false; + + // The immediate is out of range. + if (OffsetInBytes / ScalarSizeInBytes > 31) + return false; + + return true; +} + +/// Check if the value of \p Offset represents a valid immediate for the SVE +/// gather load/prefetch and scatter store instructiona with vector base and +/// immediate offset addressing mode: +/// +/// [.[S|D]{, #}] +/// +/// where = sizeof() * k, for k = 0, 1, ..., 31. +static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, + unsigned ScalarSizeInBytes) { + ConstantSDNode *OffsetConst = dyn_cast(Offset.getNode()); + return OffsetConst && isValidImmForSVEVecImmAddrMode( + OffsetConst->getZExtValue(), ScalarSizeInBytes); +} + +static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { const SDValue Src = N->getOperand(2); const EVT SrcVT = Src->getValueType(0); assert(SrcVT.isScalableVector() && @@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(4); + SDValue Base = N->getOperand(4); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal scatters because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { + Offset = + getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); + Opcode = AArch64ISD::SSTNT1_PRED; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
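getScaledOffsetForBitWidth and isValidImmForSVEVecImmAddrMode above both reduce to small pieces of integer arithmetic: an index becomes a byte offset via a shift by log2 of the element size in bytes, and the vector-plus-immediate form only accepts offsets of the form k * sizeof(element) with k in [0, 31]. A minimal scalar sketch of both (hypothetical helper names, not LLVM code):

#include <cassert>
#include <cstdint>

// Stand-in for Log2_32 on the power-of-two element sizes used here.
static unsigned log2u(unsigned X) {
  unsigned L = 0;
  while (X >>= 1)
    ++L;
  return L;
}

// Scalar model of getScaledOffsetForBitWidth: an element index becomes a byte
// offset by shifting left by log2 of the element size in bytes.
static uint64_t scaleIndexToByteOffset(uint64_t Index, unsigned EltBitWidth) {
  return Index << log2u(EltBitWidth / 8);
}

// Scalar model of isValidImmForSVEVecImmAddrMode: the [Zn.{S|D}, #imm] form
// only accepts imm = k * sizeof(element) with k in [0, 31].
static bool isValidVecPlusImm(uint64_t OffsetInBytes,
                              unsigned ScalarSizeInBytes) {
  if (OffsetInBytes % ScalarSizeInBytes)
    return false;
  return OffsetInBytes / ScalarSizeInBytes <= 31;
}

int main() {
  assert(scaleIndexToByteOffset(5, 32) == 20);  // i32 index 5 -> byte offset 20
  assert(scaleIndexToByteOffset(7, 64) == 56);  // i64 index 7 -> byte offset 56
  assert(isValidVecPlusImm(124, 4));            // 31 * 4 bytes: in range
  assert(!isValidVecPlusImm(128, 4));           // 32 * 4 bytes: out of range
  assert(!isValidVecPlusImm(6, 4));             // not a multiple of 4
  return 0;
}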
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // SST1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the stored items. For + // immediates outside that range and non-immediate scalar offsets use SST1 or + // SST1_UXTW instead. + if (Opcode == AArch64ISD::SST1_IMM_PRED) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + SrcVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = AArch64ISD::SST1_UXTW_PRED; + else + Opcode = AArch64ISD::SST1_PRED; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Source value type that is representable in hardware EVT HwSrcVt = getSVEContainerType(SrcVT); - // Keep the original type of the input data to store - this is needed to - // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the - // integer equivalent, so just use HwSrcVt. + // Keep the original type of the input data to store - this is needed to be + // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For + // FP values we want the integer equivalent, so just use HwSrcVt. SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); @@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, DL, VTs, Ops); } -static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { - EVT RetVT = N->getValueType(0); +static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + const EVT RetVT = N->getValueType(0); assert(RetVT.isScalableVector() && "Gather loads are only possible for SVE vectors"); + SDLoc DL(N); + // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(3); + SDValue Base = N->getOperand(3); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal gathers because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { + Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, + RetVT.getScalarSizeInBits()); + Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
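The SST1_IMM_PRED handling above amounts to a small decision: keep the immediate form when the offset qualifies, otherwise fall back to the register form (the UXTW variant when the base is a vector of 32-bit addresses, i.e. nxv4i32) and swap the base and offset operands; the gather combine further down applies the same rule to GLD1_IMM/GLDFF1_IMM. A sketch of that decision as a pure function, with illustrative enum and helper names:

#include <cassert>

// Hypothetical distillation of the fallback performed for SST1_IMM_PRED above:
// if the "immediate" offset does not fit the vector-plus-immediate form, fall
// back to the register form (UXTW variant when the base is nxv4i32) and swap
// the base and offset operands.
enum ScatterOpc { SST1_IMM, SST1_UXTW, SST1 };

struct Choice {
  ScatterOpc Opc;
  bool SwapBaseAndOffset;
};

static Choice chooseScatterOpcode(bool OffsetIsValidImm, bool BaseIsNxv4i32) {
  if (OffsetIsValidImm)
    return {SST1_IMM, false};
  return {BaseIsNxv4i32 ? SST1_UXTW : SST1, true};
}

int main() {
  assert(chooseScatterOpcode(true, false).Opc == SST1_IMM);
  assert(chooseScatterOpcode(false, true).Opc == SST1_UXTW);
  Choice C = chooseScatterOpcode(false, false);
  assert(C.Opc == SST1 && C.SwapBaseAndOffset);
  return 0;
}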
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && + Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // GLD{FF}1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the loaded items. For + // immediates outside that range and non-immediate scalar offsets use + // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. + if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || + Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + RetVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_UXTW_MERGE_ZERO + : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; + else + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_MERGE_ZERO + : AArch64ISD::GLDFF1_MERGE_ZERO; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); - // Keep the original output value type around - this will better inform - // optimisations (e.g. instruction folding when load is followed by - // zext/sext). This will only be used for ints, so the value for FPs - // doesn't matter. + // Keep the original output value type around - this is needed to be able to + // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP + // values we want the integer equivalent, so just use HwRetVT. SDValue OutVT = DAG.getValueType(RetVT); if (RetVT.isFloatingPoint()) OutVT = DAG.getValueType(HwRetVt); @@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, return DAG.getMergeValues({Load, LoadChain}, DL); } - static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc DL(N); SDValue Src = N->getOperand(0); unsigned Opc = Src->getOpcode(); - // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates + // Sign extend of an unsigned unpack -> signed unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + + unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI + : AArch64ISD::SUNPKLO; + + // Push the sign extend to the operand of the unpack + // This is necessary where, for example, the operand of the unpack + // is another unpack: + // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) + // -> + // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) + // -> + // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) + SDValue ExtOp = Src->getOperand(0); + auto VT = cast(N->getOperand(1))->getVT(); + EVT EltTy = VT.getVectorElementType(); + (void)EltTy; + + assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && + "Sign extending from an invalid type"); + + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), + VT.getVectorElementType(), + VT.getVectorElementCount() * 2); + + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), + ExtOp, DAG.getValueType(ExtVT)); + + return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); + } + + // SVE load nodes (e.g. 
AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. unsigned NewOpc; + unsigned MemVTOpNum = 4; switch (Opc) { - case AArch64ISD::GLD1: - NewOpc = AArch64ISD::GLD1S; + case AArch64ISD::LD1_MERGE_ZERO: + NewOpc = AArch64ISD::LD1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDNF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::GLD1_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; break; - case AArch64ISD::GLD1_SCALED: - NewOpc = AArch64ISD::GLD1S_SCALED; + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW: - NewOpc = AArch64ISD::GLD1S_SXTW; + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_SXTW_SCALED; + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW: - NewOpc = AArch64ISD::GLD1S_UXTW; + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_UXTW_SCALED; + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_IMM: - NewOpc = AArch64ISD::GLD1S_IMM; + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDNT1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; break; default: return SDValue(); } EVT SignExtSrcVT = cast(N->getOperand(1))->getVT(); - EVT GLD1SrcMemVT = cast(Src->getOperand(4))->getVT(); + EVT SrcMemVT = cast(Src->getOperand(MemVTOpNum))->getVT(); - if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse()) + if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) return SDValue(); EVT DstVT = N->getValueType(0); SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); - SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2), - Src->getOperand(3), Src->getOperand(4)}; + + SmallVector Ops; + for (unsigned I = 0; I < Src->getNumOperands(); ++I) + Ops.push_back(Src->getOperand(I)); SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); DCI.CombineTo(N, ExtLoad); @@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(N, 0); } +/// Legalize the gather prefetch (scalar + vector addressing mode) when the +/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset +/// != nxv2i32) do not need legalization. 
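The unpack combine above relies on sign_extend_inreg commuting with an unsigned unpack of one half, provided the unpack is rewritten as its signed counterpart. The per-lane sketch below checks that claim exhaustively for 16-bit lanes (a plain C++ model of one lane, not LLVM code):

#include <cassert>
#include <cstdint>

// Path A is the original DAG: unsigned unpack (zero-extend i16 -> i32)
// followed by sign_extend_inreg from i8. Path B is the rewritten DAG:
// sign_extend_inreg from i8 at i16, followed by a signed unpack (sign-extend
// i16 -> i32). Both paths agree for every 16-bit lane value, which is what
// justifies pushing the extension through the unpack.
static int32_t pathA(uint16_t Lane) {
  int32_t Widened = Lane;                            // uunpklo
  return static_cast<int8_t>(Widened & 0xff);        // sign_extend_inreg i8
}

static int32_t pathB(uint16_t Lane) {
  int16_t Narrow = static_cast<int8_t>(Lane & 0xff); // sign_extend_inreg i8
  return Narrow;                                     // sunpklo
}

int main() {
  for (uint32_t Lane = 0; Lane <= 0xffff; ++Lane)
    assert(pathA(static_cast<uint16_t>(Lane)) ==
           pathB(static_cast<uint16_t>(Lane)));
  return 0;
}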
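The opcode remapping that follows (LD1/LDNF1/LDFF1/GLD1/GLDFF1 to their S variants) is justified by a scalar identity: a zero-extending load followed by sign_extend_inreg from the memory width equals a sign-extending load of the same width, which is also why the fold insists that the SIGN_EXTEND_INREG width matches the load's memory VT. A minimal sketch for an 8-bit memory value (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

// Scalar model of the fold: zero-extending 8-bit load + sign_extend_inreg from
// i8 gives the same result as a sign-extending 8-bit load, for every possible
// byte in memory.
static int32_t sign_extend_inreg_from_i8(int32_t X) {
  return static_cast<int32_t>(static_cast<int8_t>(X & 0xff));
}

int main() {
  for (int M = 0; M < 256; ++M) {                  // every 8-bit memory value
    uint8_t Byte = static_cast<uint8_t>(M);
    int32_t ZExtLoad = Byte;                       // *_MERGE_ZERO load semantics
    int32_t SExtLoad = static_cast<int8_t>(Byte);  // *S_MERGE_ZERO load semantics
    assert(sign_extend_inreg_from_i8(ZExtLoad) == SExtLoad);
  }
  return 0;
}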
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { + const unsigned OffsetPos = 4; + SDValue Offset = N->getOperand(OffsetPos); + + // Not an unpacked vector, bail out. + if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) + return SDValue(); + + // Extend the unpacked offset vector to 64-bit lanes. + SDLoc DL(N); + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); + SmallVector Ops(N->op_begin(), N->op_end()); + // Replace the offset operand with the 64-bit one. + Ops[OffsetPos] = Offset; + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + +/// Combines a node carrying the intrinsic +/// `aarch64_sve_prf_gather_scalar_offset` into a node that uses +/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to +/// `aarch64_sve_prf_gather_scalar_offset` is not a valid immediate for the +/// sve gather prefetch instruction with vector plus immediate addressing mode. +static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, + unsigned ScalarSizeInBytes) { + const unsigned ImmPos = 4, OffsetPos = 3; + // No need to combine the node if the immediate is valid... + if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) + return SDValue(); + + // ...otherwise swap the offset base with the offset... + SmallVector Ops(N->op_begin(), N->op_end()); + std::swap(Ops[ImmPos], Ops[OffsetPos]); + // ...and remap the intrinsic `aarch64_sve_prf_gather_scalar_offset` to + // `aarch64_sve_prfb_gather_uxtw_index`. + SDLoc DL(N); + Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, + MVT::i64); + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ld1rq: + return performLD1ReplicateCombine(N, DAG); + case Intrinsic::aarch64_sve_ld1ro: + return performLD1ReplicateCombine(N, DAG); + case 
Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ld1: + return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnf1: + return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1: + return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_st1: + return performST1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); case Intrinsic::aarch64_sve_ld1_gather: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SCALED_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_sxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_ld1_gather_imm: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case 
Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_IMM_MERGE_ZERO); case Intrinsic::aarch64_sve_st1_scatter: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); case Intrinsic::aarch64_sve_st1_scatter_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_SXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, - /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_st1_scatter_imm: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_UXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); + case Intrinsic::aarch64_sve_tuple_get: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Src1 = N->getOperand(2); + SDValue Idx = N->getOperand(3); + + uint64_t IdxConst = cast(Idx)->getZExtValue(); + EVT ResVT = N->getValueType(0); + uint64_t NumLanes = ResVT.getVectorElementCount().Min; + SDValue Val = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, + DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32)); + return DAG.getMergeValues({Val, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_set: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Tuple = N->getOperand(2); + SDValue Idx = N->getOperand(3); + SDValue Vec = N->getOperand(4); + + EVT TupleVT = Tuple.getValueType(); + uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + + uint64_t IdxConst = cast(Idx)->getZExtValue(); + uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + + if ((TupleLanes % NumLanes) != 0) + report_fatal_error("invalid tuple vector!"); + + uint64_t NumVecs = TupleLanes / NumLanes; + + SmallVector Opnds; + for (unsigned I = 0; I < NumVecs; ++I) { + if (I == IdxConst) + Opnds.push_back(Vec); + else { + Opnds.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple, + DAG.getConstant(I * NumLanes, DL, MVT::i32))); + } + } + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + 
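The tuple_get/tuple_set lowering above treats an N-vector tuple as one flat concatenation, so vector I occupies lanes [I * NumLanes, (I + 1) * NumLanes). A plain-array sketch of the same index arithmetic (illustrative code, not LLVM API):

#include <algorithm>
#include <cassert>
#include <vector>

// Plain-array model of tuple_get/tuple_set: element vector Idx lives at the
// lane range [Idx * NumLanes, (Idx + 1) * NumLanes) of the flat tuple.
static std::vector<int> tupleGet(const std::vector<int> &Tuple, unsigned Idx,
                                 unsigned NumLanes) {
  auto Begin = Tuple.begin() + Idx * NumLanes;
  return std::vector<int>(Begin, Begin + NumLanes);
}

static std::vector<int> tupleSet(std::vector<int> Tuple, unsigned Idx,
                                 const std::vector<int> &Vec) {
  size_t NumLanes = Vec.size();
  assert(Tuple.size() % NumLanes == 0 && "invalid tuple vector!");
  std::copy(Vec.begin(), Vec.end(), Tuple.begin() + Idx * NumLanes);
  return Tuple;
}

int main() {
  std::vector<int> Tuple = {0, 1, 2, 3, 4, 5, 6, 7}; // two 4-lane vectors
  assert(tupleGet(Tuple, 1, 4) == std::vector<int>({4, 5, 6, 7}));
  std::vector<int> Updated = tupleSet(Tuple, 0, {9, 9, 9, 9});
  assert(tupleGet(Updated, 0, 4) == std::vector<int>({9, 9, 9, 9}));
  assert(tupleGet(Updated, 1, 4) == std::vector<int>({4, 5, 6, 7}));
  return 0;
}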
case Intrinsic::aarch64_sve_tuple_create2: + case Intrinsic::aarch64_sve_tuple_create3: + case Intrinsic::aarch64_sve_tuple_create4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + + SmallVector Opnds; + for (unsigned I = 2; I < N->getNumOperands(); ++I) + Opnds.push_back(N->getOperand(I)); + + EVT VT = Opnds[0].getValueType(); + EVT EltVT = VT.getVectorElementType(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + VT.getVectorElementCount() * + (N->getNumOperands() - 2)); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_ld2: + case Intrinsic::aarch64_sve_ld3: + case Intrinsic::aarch64_sve_ld4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Mask = N->getOperand(2); + SDValue BasePtr = N->getOperand(3); + SDValue LoadOps[] = {Chain, Mask, BasePtr}; + unsigned IntrinsicID = + cast(N->getOperand(1))->getZExtValue(); + SDValue Result = + LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL); + return DAG.getMergeValues({Result, Chain}, DL); + } default: break; } @@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, SDLoc DL(N); SDValue Op = N->getOperand(0); - if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + if (N->getValueType(0) != MVT::i16 || + (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16)) return; Op = SDValue( @@ -12759,6 +14286,40 @@ static std::pair splitInt128(SDValue N, SelectionDAG &DAG) { return std::make_pair(Lo, Hi); } +void AArch64TargetLowering::ReplaceExtractSubVectorResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + + // Common code will handle these just fine. + if (!InVT.isScalableVector() || !InVT.isInteger()) + return; + + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // The following checks bail if this is not a halving operation. + + ElementCount ResEC = VT.getVectorElementCount(); + + if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) + return; + + auto *CIndex = dyn_cast(N->getOperand(1)); + if (!CIndex) + return; + + unsigned Index = CIndex->getZExtValue(); + if ((Index != 0) && (Index != ResEC.Min)) + return; + + unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; + EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); + + SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); +} + // Create an even/odd pair of X registers holding integer value V. 
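ReplaceExtractSubVectorResults above lowers the extraction of either half of a scalable integer vector as an unpack of that half to wider lanes followed by a truncate. The fixed-size sketch below checks that equivalence on a concrete 4 x i16 example (a plain C++ model with illustrative names):

#include <array>
#include <cassert>
#include <cstdint>

// Taking the low or high half of an integer vector equals unpacking that half
// to wider lanes and truncating back, which is how the scalable case is
// lowered (UUNPKLO/UUNPKHI followed by ISD::TRUNCATE).
using V4x16 = std::array<uint16_t, 4>;
using V2x16 = std::array<uint16_t, 2>;
using V2x32 = std::array<uint32_t, 2>;

static V2x32 unpackLo(const V4x16 &V) { return {V[0], V[1]}; }
static V2x32 unpackHi(const V4x16 &V) { return {V[2], V[3]}; }
static V2x16 truncateLanes(const V2x32 &V) {
  return {static_cast<uint16_t>(V[0]), static_cast<uint16_t>(V[1])};
}

int main() {
  V4x16 In = {0x1111, 0x2222, 0x3333, 0x4444};
  V2x16 Lo = {In[0], In[1]};             // extract_subvector at index 0
  V2x16 Hi = {In[2], In[3]};             // extract_subvector at index 2 (half)
  assert(truncateLanes(unpackLo(In)) == Lo);
  assert(truncateLanes(unpackHi(In)) == Hi);
  return 0;
}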
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); @@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); - Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); + SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + Results.push_back( + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } @@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, MachineMemOperand *MemOp = cast(N)->getMemOperand(); DAG.setNodeMemRefs(cast(CmpSwap), {MemOp}); - Results.push_back(SDValue(CmpSwap, 0)); - Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, + SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); Results.push_back(SDValue(CmpSwap, 3)); } @@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::CTPOP: + Results.push_back(LowerCTPOP(SDValue(N, 0), DAG)); + return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; @@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.append({Pair, Result.getValue(2) /* Chain */}); return; } + case ISD::EXTRACT_SUBVECTOR: + ReplaceExtractSubVectorResults(N, Results, DAG); + return; case ISD::INTRINSIC_WO_CHAIN: { EVT VT = N->getValueType(0); assert((VT == MVT::i8 || VT == MVT::i16) && @@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - if (getTargetMachine().getOptLevel() == 0) + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::None; return AtomicExpansionKind::LLSC; } @@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = - Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); return OptSize && !VT.isVector(); } @@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } + +bool AArch64TargetLowering::shouldLocalize( + const MachineInstr &MI, const TargetTransformInfo *TTI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_GLOBAL_VALUE: { + // On Darwin, TLS global vars get selected into function calls, which + // we don't want localized, as they can get moved into the middle of a + // another call sequence. 
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal(); + if (GV.isThreadLocal() && Subtarget->isTargetMachO()) + return false; + break; + } + // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being + // localizable. + case AArch64::ADRP: + case AArch64::G_ADD_LOW: + return true; + default: + break; + } + return TargetLoweringBase::shouldLocalize(MI, TTI); +} + +bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { + if (isa(Inst.getType())) + return true; + + for (unsigned i = 0; i < Inst.getNumOperands(); ++i) + if (isa(Inst.getOperand(i)->getType())) + return true; + + return false; +} + +// Return the largest legal scalable vector type that matches VT's element type. +static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE container"); + case MVT::i8: + return EVT(MVT::nxv16i8); + case MVT::i16: + return EVT(MVT::nxv8i16); + case MVT::i32: + return EVT(MVT::nxv4i32); + case MVT::i64: + return EVT(MVT::nxv2i64); + case MVT::f16: + return EVT(MVT::nxv8f16); + case MVT::f32: + return EVT(MVT::nxv4f32); + case MVT::f64: + return EVT(MVT::nxv2f64); + } +} + +// Return a PTRUE with active lanes corresponding to the extent of VT. +static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + + int PgPattern; + switch (VT.getVectorNumElements()) { + default: + llvm_unreachable("unexpected element count for SVE predicate"); + case 1: + PgPattern = AArch64SVEPredPattern::vl1; + break; + case 2: + PgPattern = AArch64SVEPredPattern::vl2; + break; + case 4: + PgPattern = AArch64SVEPredPattern::vl4; + break; + case 8: + PgPattern = AArch64SVEPredPattern::vl8; + break; + case 16: + PgPattern = AArch64SVEPredPattern::vl16; + break; + case 32: + PgPattern = AArch64SVEPredPattern::vl32; + break; + case 64: + PgPattern = AArch64SVEPredPattern::vl64; + break; + case 128: + PgPattern = AArch64SVEPredPattern::vl128; + break; + case 256: + PgPattern = AArch64SVEPredPattern::vl256; + break; + } + + // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can + // use AArch64SVEPredPattern::all, which can enable the use of unpredicated + // variants of instructions when available. 
+ + MVT MaskVT; + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE predicate"); + case MVT::i8: + MaskVT = MVT::nxv16i1; + break; + case MVT::i16: + case MVT::f16: + MaskVT = MVT::nxv8i1; + break; + case MVT::i32: + case MVT::f32: + MaskVT = MVT::nxv4i1; + break; + case MVT::i64: + case MVT::f64: + MaskVT = MVT::nxv2i1; + break; + } + + return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT, + DAG.getTargetConstant(PgPattern, DL, MVT::i64)); +} + +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal scalable vector!"); + auto PredTy = VT.changeVectorElementType(MVT::i1); + return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); +} + +static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { + if (VT.isFixedLengthVector()) + return getPredicateForFixedLengthVector(DAG, DL, VT); + + return getPredicateForScalableVector(DAG, DL, VT); +} + +// Grow V to consume an entire SVE register. +static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isScalableVector() && + "Expected to convert into a scalable vector!"); + assert(V.getValueType().isFixedLengthVector() && + "Expected a fixed length vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); +} + +// Shrink V so it's just big enough to maintain a VT's worth of data. +static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isFixedLengthVector() && + "Expected to convert into a fixed length vector!"); + assert(V.getValueType().isScalableVector() && + "Expected a scalable vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero); +} + +// Convert all fixed length vector loads larger than NEON to masked_loads. +SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Load = cast(Op); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewLoad = DAG.getMaskedLoad( + ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT), + Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), + Load->getExtensionType()); + + auto Result = convertFromScalableVector(DAG, VT, NewLoad); + SDValue MergedValues[2] = {Result, Load->getChain()}; + return DAG.getMergeValues(MergedValues, DL); +} + +// Convert all fixed length vector stores larger than NEON to masked_stores. 
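The fixed-length-on-SVE scheme used by LowerFixedLengthVectorLoadToSVE keeps a fixed-length value in the low lanes of a scalable container and predicates the operation with a PTRUE of exactly that many lanes, so an ordinary load becomes a masked load. A toy model with an arbitrary 8-lane container (illustrative names, not LLVM code):

#include <array>
#include <cassert>
#include <cstdint>

// Toy model: the fixed-length value occupies the low lanes of the container,
// the predicate from getPredicateForFixedLengthVector enables exactly those
// lanes, and a masked load only touches active lanes.
constexpr unsigned ContainerLanes = 8;
using Pred = std::array<bool, ContainerLanes>;
using Vec = std::array<int32_t, ContainerLanes>;

static Pred ptrueVL(unsigned N) {       // AArch64SVEPredPattern::vlN
  Pred P{};
  for (unsigned I = 0; I < N; ++I)
    P[I] = true;
  return P;
}

static Vec maskedLoad(const int32_t *Mem, const Pred &P) {
  Vec V{};                              // inactive lanes simply stay zero here
  for (unsigned I = 0; I < ContainerLanes; ++I)
    if (P[I])
      V[I] = Mem[I];
  return V;
}

int main() {
  int32_t Mem[4] = {10, 20, 30, 40};    // a fixed-length v4i32 in memory
  Vec Container = maskedLoad(Mem, ptrueVL(4));
  for (unsigned I = 0; I < 4; ++I)      // convertFromScalableVector reads back
    assert(Container[I] == Mem[I]);     // only the low 4 lanes
  return 0;
}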
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Store = cast(Op); + + SDLoc DL(Op); + EVT VT = Store->getValue().getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); + return DAG.getMaskedStore( + Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore()); +} + +SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + SDValue Val = Op.getOperand(0); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); + Val = convertToScalableVector(DAG, ContainerVT, Val); + + // Repeatedly truncate Val until the result is of the desired element type. + switch (ContainerVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unimplemented container type"); + case MVT::nxv2i64: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); + if (VT.getVectorElementType() == MVT::i32) + break; + LLVM_FALLTHROUGH; + case MVT::nxv4i32: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); + if (VT.getVectorElementType() == MVT::i16) + break; + LLVM_FALLTHROUGH; + case MVT::nxv8i16: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); + assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); + break; + } + + return convertFromScalableVector(DAG, VT, Val); +} + +SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, + SelectionDAG &DAG, + unsigned NewOp) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + auto Pg = getPredicateForVector(DAG, DL, VT); + + if (useSVEForFixedLengthVectorVT(VT)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + // Create list of operands by convereting existing ones to scalable types. 
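LowerFixedLengthVectorTruncateToSVE above narrows by repeatedly bitcasting to the next-smaller element type and applying UZP1 of the value with itself, which leaves the low half of each wider element in the low lanes. One step of that chain, modelled on a 128-bit fragment (this sketch assumes a little-endian host; plain C++, not LLVM code):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

// One step of the truncation chain: bitcast a 2 x i64 vector to 4 x i32, then
// UZP1 of the vector with itself. The low half of the result holds the
// truncated i32 values, which is all the fixed-length result ends up reading.
using V2x64 = std::array<uint64_t, 2>;
using V4x32 = std::array<uint32_t, 4>;

static V4x32 bitcastTo4x32(const V2x64 &V) {
  V4x32 R;
  std::memcpy(R.data(), V.data(), sizeof(R)); // little-endian reinterpretation
  return R;
}

static V4x32 uzp1(const V4x32 &A, const V4x32 &B) {
  return {A[0], A[2], B[0], B[2]};            // even elements of A, then of B
}

int main() {
  V2x64 Wide = {0x1111222233334444ULL, 0x5555666677778888ULL};
  V4x32 Narrow = uzp1(bitcastTo4x32(Wide), bitcastTo4x32(Wide));
  assert(Narrow[0] == 0x33334444u);           // trunc of element 0
  assert(Narrow[1] == 0x77778888u);           // trunc of element 1
  return 0;
}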
+ SmallVector Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + if (isa(V)) { + Operands.push_back(V); + continue; + } + + assert(useSVEForFixedLengthVectorVT(V.getValueType()) && + "Only fixed length vectors are supported!"); + Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); + } + + auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); + return convertFromScalableVector(DAG, VT, ScalableRes); + } + + assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); + + SmallVector Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + assert((isa(V) || V.getValueType().isScalableVector()) && + "Only scalable vectors are supported!"); + Operands.push_back(V); + } + + return DAG.getNode(NewOp, DL, VT, Operands); +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 672dfc4fcbc06..4fe77481706b3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -25,6 +25,26 @@ namespace llvm { namespace AArch64ISD { +// For predicated nodes where the result is a vector, the operation is +// controlled by a governing predicate and the inactive lanes are explicitly +// defined with a value, please stick the following naming convention: +// +// _MERGE_OP The result value is a vector with inactive lanes equal +// to source operand OP. +// +// _MERGE_ZERO The result value is a vector with inactive lanes +// actively zeroed. +// +// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal +// to the last source operand which only purpose is being +// a passthru value. +// +// For other cases where no explicit action is needed to set the inactive lanes, +// or when the result is not a vector and it is needed or helpful to +// distinguish a node from similar unpredicated nodes, use: +// +// _PRED +// enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. @@ -52,6 +72,22 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions + // Arithmetic instructions + ADD_PRED, + FADD_PRED, + SDIV_PRED, + UDIV_PRED, + FMA_PRED, + SMIN_MERGE_OP1, + UMIN_MERGE_OP1, + SMAX_MERGE_OP1, + UMAX_MERGE_OP1, + SHL_MERGE_OP1, + SRL_MERGE_OP1, + SRA_MERGE_OP1, + + SETCC_MERGE_ZERO, + // Arithmetic instructions which write flags. ADDS, SUBS, @@ -90,9 +126,9 @@ enum NodeType : unsigned { BICi, ORRi, - // Vector bit select: similar to ISD::VSELECT but not all bits within an + // Vector bitwise select: similar to ISD::VSELECT but not all bits within an // element must be identical. - BSL, + BSP, // Vector arithmetic negation NEG, @@ -121,6 +157,10 @@ enum NodeType : unsigned { SRSHR_I, URSHR_I, + // Vector shift by constant and insert + VSLI, + VSRI, + // Vector comparisons CMEQ, CMGE, @@ -148,6 +188,10 @@ enum NodeType : unsigned { SADDV, UADDV, + // Vector rounding halving addition + SRHADD, + URHADD, + // Vector across-lanes min/max // Only the lower result lane is defined. SMINV, @@ -166,7 +210,7 @@ enum NodeType : unsigned { // Vector bitwise negation NOT, - // Vector bitwise selection + // Vector bitwise insertion BIT, // Compare-and-branch @@ -196,8 +240,10 @@ enum NodeType : unsigned { UMULL, // Reciprocal estimates and steps. - FRECPE, FRECPS, - FRSQRTE, FRSQRTS, + FRECPE, + FRECPS, + FRSQRTE, + FRSQRTS, SUNPKHI, SUNPKLO, @@ -211,35 +257,97 @@ enum NodeType : unsigned { REV, TBL, + // Floating-point reductions. 
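The predication naming convention described in the header comment above is easiest to see lane by lane: the governing predicate chooses which lanes compute, and the suffix states what the inactive lanes hold. A sketch using an add as the example operation (the functions below are illustrative models, not actual AArch64ISD nodes):

#include <array>
#include <cassert>

// Lane-wise illustration of the _MERGE_* suffixes, with a 4-lane add as the
// example operation.
using Vec = std::array<int, 4>;
using Pred = std::array<bool, 4>;

static Vec addMergeZero(Pred G, Vec A, Vec B) {   // ..._MERGE_ZERO
  Vec R{};
  for (int I = 0; I < 4; ++I)
    R[I] = G[I] ? A[I] + B[I] : 0;
  return R;
}

static Vec addMergeOp1(Pred G, Vec A, Vec B) {    // ..._MERGE_OP1
  Vec R;
  for (int I = 0; I < 4; ++I)
    R[I] = G[I] ? A[I] + B[I] : A[I];
  return R;
}

static Vec addMergePassthru(Pred G, Vec A, Vec B, // ..._MERGE_PASSTHRU
                            Vec Passthru) {
  Vec R;
  for (int I = 0; I < 4; ++I)
    R[I] = G[I] ? A[I] + B[I] : Passthru[I];
  return R;
}

int main() {
  Pred G = {true, false, true, false};
  Vec A = {1, 2, 3, 4}, B = {10, 20, 30, 40}, P = {-1, -1, -1, -1};
  assert(addMergeZero(G, A, B) == (Vec{11, 0, 33, 0}));
  assert(addMergeOp1(G, A, B) == (Vec{11, 2, 33, 4}));
  assert(addMergePassthru(G, A, B, P) == (Vec{11, -1, 33, -1}));
  return 0;
}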
+ FADDA_PRED, + FADDV_PRED, + FMAXV_PRED, + FMAXNMV_PRED, + FMINV_PRED, + FMINNMV_PRED, + INSR, PTEST, PTRUE, + DUP_MERGE_PASSTHRU, + INDEX_VECTOR, + + REINTERPRET_CAST, + + LD1_MERGE_ZERO, + LD1S_MERGE_ZERO, + LDNF1_MERGE_ZERO, + LDNF1S_MERGE_ZERO, + LDFF1_MERGE_ZERO, + LDFF1S_MERGE_ZERO, + LD1RQ_MERGE_ZERO, + LD1RO_MERGE_ZERO, + + // Structured loads. + SVE_LD2_MERGE_ZERO, + SVE_LD3_MERGE_ZERO, + SVE_LD4_MERGE_ZERO, + // Unsigned gather loads. - GLD1, - GLD1_SCALED, - GLD1_UXTW, - GLD1_SXTW, - GLD1_UXTW_SCALED, - GLD1_SXTW_SCALED, - GLD1_IMM, + GLD1_MERGE_ZERO, + GLD1_SCALED_MERGE_ZERO, + GLD1_UXTW_MERGE_ZERO, + GLD1_SXTW_MERGE_ZERO, + GLD1_UXTW_SCALED_MERGE_ZERO, + GLD1_SXTW_SCALED_MERGE_ZERO, + GLD1_IMM_MERGE_ZERO, // Signed gather loads - GLD1S, - GLD1S_SCALED, - GLD1S_UXTW, - GLD1S_SXTW, - GLD1S_UXTW_SCALED, - GLD1S_SXTW_SCALED, - GLD1S_IMM, + GLD1S_MERGE_ZERO, + GLD1S_SCALED_MERGE_ZERO, + GLD1S_UXTW_MERGE_ZERO, + GLD1S_SXTW_MERGE_ZERO, + GLD1S_UXTW_SCALED_MERGE_ZERO, + GLD1S_SXTW_SCALED_MERGE_ZERO, + GLD1S_IMM_MERGE_ZERO, + + // Unsigned gather loads. + GLDFF1_MERGE_ZERO, + GLDFF1_SCALED_MERGE_ZERO, + GLDFF1_UXTW_MERGE_ZERO, + GLDFF1_SXTW_MERGE_ZERO, + GLDFF1_UXTW_SCALED_MERGE_ZERO, + GLDFF1_SXTW_SCALED_MERGE_ZERO, + GLDFF1_IMM_MERGE_ZERO, + + // Signed gather loads. + GLDFF1S_MERGE_ZERO, + GLDFF1S_SCALED_MERGE_ZERO, + GLDFF1S_UXTW_MERGE_ZERO, + GLDFF1S_SXTW_MERGE_ZERO, + GLDFF1S_UXTW_SCALED_MERGE_ZERO, + GLDFF1S_SXTW_SCALED_MERGE_ZERO, + GLDFF1S_IMM_MERGE_ZERO, + + // Non-temporal gather loads + GLDNT1_MERGE_ZERO, + GLDNT1_INDEX_MERGE_ZERO, + GLDNT1S_MERGE_ZERO, + + // Contiguous masked store. + ST1_PRED, + // Scatter store - SST1, - SST1_SCALED, - SST1_UXTW, - SST1_SXTW, - SST1_UXTW_SCALED, - SST1_SXTW_SCALED, - SST1_IMM, + SST1_PRED, + SST1_SCALED_PRED, + SST1_UXTW_PRED, + SST1_SXTW_PRED, + SST1_UXTW_SCALED_PRED, + SST1_SXTW_SCALED_PRED, + SST1_IMM_PRED, + + // Non-temporal scatter store + SSTNT1_PRED, + SSTNT1_INDEX_PRED, + + // Strict (exception-raising) floating point comparison + STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCMPE, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -272,7 +380,8 @@ enum NodeType : unsigned { STZ2G, LDP, - STP + STP, + STNP }; } // end namespace AArch64ISD @@ -321,7 +430,8 @@ public: return MVT::getIntegerVT(64); } - bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; @@ -333,9 +443,10 @@ public: MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; /// LLT variant. - bool allowsMisalignedMemoryAccesses( - LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *Fast = nullptr) const override; + bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, + Align Alignment, + MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. 
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -376,9 +487,6 @@ public: MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, - MachineBasicBlock *BB) const; - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -402,7 +510,7 @@ public: bool shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -418,13 +526,11 @@ public: bool shouldConsiderGEPOffsetSplit() const override; - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; - LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(const MemOp &Op, + const AttributeList &FuncAttributes) const override; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. @@ -463,6 +569,13 @@ public: bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; + bool shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool MathUsed) const override { + // Using overflow ops for overflow checks only should beneficial on + // AArch64. + return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); + } + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, @@ -497,7 +610,7 @@ public: /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. - unsigned + Register getExceptionPointerRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X0; @@ -505,7 +618,7 @@ public: /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. - unsigned + Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X1; @@ -611,13 +724,27 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; - MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + MachineMemOperand::Flags getTargetMMOFlags( + const Instruction &I) const override; bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; /// Used for exception handling on Win64. bool needsFixedCatchObjects() const override; + + bool fallBackToDAGISel(const Instruction &Inst) const override; + + /// SVE code generation for fixed length vectors does not custom lower + /// BUILD_VECTOR. This makes BUILD_VECTOR legalisation a source of stores to + /// merge. However, merging them creates a BUILD_VECTOR that is just as + /// illegal as the original, thus leading to an infinite legalisation loop. 
+ /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal + /// vector types this override can be removed. + bool mergeStoresAfterLegalization(EVT VT) const override { + return !useSVEForFixedLengthVectors(); + } + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -626,6 +753,7 @@ private: bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); + void addTypeForFixedLengthSVE(MVT VT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); @@ -729,7 +857,11 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, + unsigned NewOp) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; @@ -746,6 +878,8 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; @@ -753,6 +887,13 @@ private: SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const; + SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef LoadOps, + EVT VT, SelectionDAG &DAG, const SDLoc &DL) const; + + SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op, + SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; @@ -807,10 +948,19 @@ private: void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; + void ReplaceExtractSubVectorResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const; bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; void finalizeLowering(MachineFunction &MF) const override; + + bool shouldLocalize(const MachineInstr &MI, + const TargetTransformInfo *TTI) const override; + + bool useSVEForFixedLengthVectors() const; + bool useSVEForFixedLengthVectorVT(EVT VT) const; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index c3efe03a0987f..6df7970f4d82b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -20,6 +20,30 @@ class Format val> { def PseudoFrm : Format<0>; def NormalFrm : Format<1>; // Do we need any others? +// Enum describing whether an instruction is +// destructive in its first source operand. 
+class DestructiveInstTypeEnum val> { + bits<4> Value = val; +} +def NotDestructive : DestructiveInstTypeEnum<0>; +// Destructive in its first operand and can be MOVPRFX'd, but has no other +// special properties. +def DestructiveOther : DestructiveInstTypeEnum<1>; +def DestructiveUnary : DestructiveInstTypeEnum<2>; +def DestructiveBinaryImm : DestructiveInstTypeEnum<3>; +def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>; +def DestructiveBinary : DestructiveInstTypeEnum<5>; +def DestructiveBinaryComm : DestructiveInstTypeEnum<6>; +def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; +def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; + +class FalseLanesEnum val> { + bits<2> Value = val; +} +def FalseLanesNone : FalseLanesEnum<0>; +def FalseLanesZero : FalseLanesEnum<1>; +def FalseLanesUndef : FalseLanesEnum<2>; + // AArch64 Instruction Format class AArch64Inst : Instruction { field bits<32> Inst; // Instruction encoding. @@ -34,6 +58,16 @@ class AArch64Inst : Instruction { let Namespace = "AArch64"; Format F = f; bits<2> Form = F.Value; + + // Defaults + FalseLanesEnum FalseLanes = FalseLanesNone; + DestructiveInstTypeEnum DestructiveInstType = NotDestructive; + ElementSizeEnum ElementSize = ElementSizeNone; + + let TSFlags{8-7} = FalseLanes.Value; + let TSFlags{6-3} = DestructiveInstType.Value; + let TSFlags{2-0} = ElementSize.Value; + let Pattern = []; let Constraints = cstr; } @@ -48,6 +82,7 @@ class Pseudo pattern, string cstr = ""> dag InOperandList = iops; let Pattern = pattern; let isCodeGenOnly = 1; + let isPseudo = 1; } // Real instructions (have encoding information) @@ -56,14 +91,6 @@ class EncodedI pattern> : AArch64Inst { let Size = 4; } -// Enum describing whether an instruction is -// destructive in its first source operand. -class DestructiveInstTypeEnum val> { - bits<1> Value = val; -} -def NotDestructive : DestructiveInstTypeEnum<0>; -def Destructive : DestructiveInstTypeEnum<1>; - // Normal instructions class I pattern> @@ -71,13 +98,6 @@ class I : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; @@ -327,6 +347,18 @@ def simm5_32b : Operand, ImmLeaf= -16 && Imm < 16; }]> let DecoderMethod = "DecodeSImm<5>"; } +def simm5_8b : Operand, ImmLeaf= -16 && (int8_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<8>"; +} + +def simm5_16b : Operand, ImmLeaf= -16 && (int16_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<16>"; +} + // simm7sN predicate - True if the immediate is a multiple of N in the range // [-64 * N, 63 * N]. 
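The simm7sN predicate described above is the general pattern for the scaled signed immediates in this file: a value is legal when it is Scale * k for a k that fits the signed bit-width, and the operand is printed and encoded as k. A small check/encode sketch for the 7-bit case (hypothetical helpers in plain C++, not TableGen or LLVM code):

#include <cassert>
#include <cstdint>

// Scaled 7-bit signed immediate: legal values are Scale * k with k in
// [-64, 63], and the operand is encoded as k = value / Scale.
static bool isValidSImm7Scaled(int64_t Imm, int64_t Scale) {
  return Imm % Scale == 0 && Imm / Scale >= -64 && Imm / Scale <= 63;
}

static int64_t encodeSImm7Scaled(int64_t Imm, int64_t Scale) {
  assert(isValidSImm7Scaled(Imm, Scale));
  return Imm / Scale;
}

int main() {
  assert(isValidSImm7Scaled(-1024, 16) && encodeSImm7Scaled(-1024, 16) == -64);
  assert(isValidSImm7Scaled(1008, 16) && encodeSImm7Scaled(1008, 16) == 63);
  assert(!isValidSImm7Scaled(1024, 16)); // 64 * 16 is out of range
  assert(!isValidSImm7Scaled(24, 16));   // not a multiple of the scale
  return 0;
}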
@@ -349,6 +381,8 @@ def simm7s16 : Operand { let PrintMethod = "printImmScale<16>"; } +def am_sve_fi : ComplexPattern; + def am_indexed7s8 : ComplexPattern; def am_indexed7s16 : ComplexPattern; def am_indexed7s32 : ComplexPattern; @@ -358,6 +392,9 @@ def am_indexed7s128 : ComplexPattern; def am_indexedu6s128 : ComplexPattern; def am_indexeds9s128 : ComplexPattern; +def UImmS1XForm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; def UImmS2XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); }]>; @@ -446,6 +483,19 @@ def uimm6s16 : Operand, ImmLeafgetTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def SImmS3XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64); +}]>; +def SImmS4XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def SImmS16XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); +}]>; + // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>; @@ -461,6 +511,7 @@ def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>; def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; +def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>; def simm4s1 : Operand, ImmLeaf=-8 && Imm <= 7; }]> { @@ -469,31 +520,37 @@ def simm4s1 : Operand, ImmLeaf, ImmLeaf=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> { +[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> { let PrintMethod = "printImmScale<2>"; let ParserMatchClass = SImm4s2Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s3 : Operand, ImmLeaf=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> { +[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> { let PrintMethod = "printImmScale<3>"; let ParserMatchClass = SImm4s3Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s4 : Operand, ImmLeaf=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> { +[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> { let PrintMethod = "printImmScale<4>"; let ParserMatchClass = SImm4s4Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s16 : Operand, ImmLeaf=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> { +[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> { let PrintMethod = "printImmScale<16>"; let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } +def simm4s32 : Operand, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { + let PrintMethod = "printImmScale<32>"; + let ParserMatchClass = SImm4s32Operand; + let DecoderMethod = "DecodeSImm<4>"; +} def Imm1_8Operand : AsmImmRange<1, 8>; def Imm1_16Operand : AsmImmRange<1, 16>; @@ -647,6 +704,13 @@ def tvecshiftR32 : Operand, TImmLeaf, TImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; @@ -683,6 +747,36 @@ def vecshiftL64 : Operand, ImmLeaf, TImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def tvecshiftL16 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def 
tvecshiftL32 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def tvecshiftL64 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} // Crazy immediate formats used by 32-bit and 64-bit logical immediate // instructions for splatting repeating bit patterns across the immediate. @@ -796,7 +890,7 @@ def imm0_31 : Operand, ImmLeaf, TImmLeaf { @@ -832,7 +926,7 @@ def imm0_7 : Operand, ImmLeaf, ImmLeaf, TImmLeaf { let ParserMatchClass = Imm0_7Operand; @@ -1091,29 +1185,44 @@ class AsmVectorIndex : AsmOperandClass { let RenderMethod = "addVectorIndexOperands"; } -class AsmVectorIndexOpnd - : Operand, ImmLeaf { +class AsmVectorIndexOpnd + : Operand { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } +multiclass VectorIndex { + def "" : AsmVectorIndexOpnd, ImmLeaf; + def _timm : AsmVectorIndexOpnd, TImmLeaf; +} + def VectorIndex1Operand : AsmVectorIndex<1, 1>; def VectorIndexBOperand : AsmVectorIndex<0, 15>; def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -def VectorIndex1 : AsmVectorIndexOpnd; -def VectorIndexB : AsmVectorIndexOpnd; -def VectorIndexH : AsmVectorIndexOpnd; -def VectorIndexS : AsmVectorIndexOpnd; -def VectorIndexD : AsmVectorIndexOpnd; - -def VectorIndex132b : AsmVectorIndexOpnd; -def VectorIndexB32b : AsmVectorIndexOpnd; -def VectorIndexH32b : AsmVectorIndexOpnd; -def VectorIndexS32b : AsmVectorIndexOpnd; -def VectorIndexD32b : AsmVectorIndexOpnd; +defm VectorIndex1 : VectorIndex; +defm VectorIndexB : VectorIndex; +defm VectorIndexH : VectorIndex; +defm VectorIndexS : VectorIndex; +defm VectorIndexD : VectorIndex; + +defm VectorIndex132b : VectorIndex; +defm VectorIndexB32b : VectorIndex; +defm VectorIndexH32b : VectorIndex; +defm VectorIndexS32b : VectorIndex; +defm VectorIndexD32b : VectorIndex; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; @@ -1121,16 +1230,21 @@ def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">; def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; -def sve_elm_idx_extdup_b - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_h - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_s - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_d - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_q - : AsmVectorIndexOpnd; +defm sve_elm_idx_extdup_b + : VectorIndex; +defm sve_elm_idx_extdup_h + : VectorIndex; +defm sve_elm_idx_extdup_s + : VectorIndex; +defm sve_elm_idx_extdup_d + : VectorIndex; +defm sve_elm_idx_extdup_q + : VectorIndex; // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh @@ -1533,6 +1647,8 @@ class BaseAuthLoad { @@ -4333,14 +4449,14 @@ multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { // Unscaled half-precision to 32-bit def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 0; // 32-bit GPR flag let Predicates = [HasFullFP16]; } // Unscaled half-precision to 64-bit def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, 
asm, - [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; } @@ -4375,7 +4491,7 @@ multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, // Scaled half-precision to 32-bit def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, fixedpoint_f16_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; @@ -4385,7 +4501,7 @@ multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, // Scaled half-precision to 64-bit def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, fixedpoint_f16_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; @@ -4501,7 +4617,7 @@ multiclass IntegerToFP { // Scaled def SWHri: BaseIntegerToFP { let Inst{31} = 0; // 32-bit GPR flag @@ -4529,7 +4645,7 @@ multiclass IntegerToFP { } def SXHri: BaseIntegerToFP { let Inst{31} = 1; // 64-bit GPR flag @@ -4702,19 +4818,19 @@ class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, multiclass FPConversion { // Double-precision to Half-precision def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fpround FPR64:$Rn))]>; + [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>; // Double-precision to Single-precision def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fpround FPR64:$Rn))]>; + [(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>; // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, @@ -4722,7 +4838,7 @@ multiclass FPConversion { // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fpround FPR32:$Rn))]>; + [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>; } //--- @@ -4824,7 +4940,7 @@ multiclass TwoOperandFPData opcode, string asm, multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { def Hrr : BaseTwoOperandFPData { + [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } @@ -4866,7 +4982,7 @@ class BaseThreeOperandFPData { def Hrrr : BaseThreeOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; @@ -4928,7 +5044,7 @@ multiclass FPComparison { let Defs = [NZCV] in { def Hrr : BaseTwoOperandFPComparison { + [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } @@ -5142,6 +5258,47 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorPseudo pattern> + : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>, + Sched<[WriteV]>; + +multiclass SIMDLogicalThreeVectorPseudo { + def v8i8 : BaseSIMDThreeSameVectorPseudo; + def v16i8 : 
BaseSIMDThreeSameVectorPseudo; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { @@ -5362,7 +5519,7 @@ multiclass SIMDLogicalThreeVector size, string asm, } multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { + string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), @@ -5402,11 +5559,11 @@ multiclass SIMDLogicalThreeVectorTied size, // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. -class BaseSIMDThreeSameVectorDot : - BaseSIMDThreeSameVectorTied { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, +multiclass SIMDThreeSameVectorDot { + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -6581,13 +6738,13 @@ multiclass SIMDThreeScalarHSTied opc, string asm, multiclass SIMDFPThreeScalar opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar; + def NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } @@ -6598,12 +6755,12 @@ multiclass SIMDFPThreeScalar opc, string asm, multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } @@ -6794,7 +6951,7 @@ multiclass SIMDFPTwoScalarCVT opc, string asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; let Predicates = [HasNEON, HasFullFP16] in { def v1i16 : BaseSIMDTwoScalar; + [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>; } } @@ -6936,10 +7093,10 @@ multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, let Predicates = [HasNEON, HasFullFP16] in { def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, asm, ".4h", - [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>; def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, asm, ".8h", - [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + [(set (f16 
FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", @@ -7136,7 +7293,7 @@ class SIMDInsMainMovAlias; class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -7377,7 +7534,7 @@ class BaseSIMDScalarCPY - : InstAlias; @@ -7651,13 +7808,152 @@ class BaseSIMDIndexedTied size, bits<4> opc, let Inst{4-0} = Rd; } + +//---------------------------------------------------------------------------- +// Armv8.6 BFloat16 Extension +//---------------------------------------------------------------------------- +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in { + +class BaseSIMDThreeSameVectorBFDot + : BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, + "{\t$Rd" # kind1 # ", $Rn" # kind2 # + ", $Rm" # kind2 # "}"); +} + +multiclass SIMDThreeSameVectorBFDot { + def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, + v2f32, v8i8>; + def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, + v4f32, v16i8>; +} + +class BaseSIMDThreeSameVectorBF16DotI + : BaseSIMDIndexedTied { + + bits<2> idx; + let Inst{21} = idx{0}; // L + let Inst{11} = idx{1}; // H +} + +multiclass SIMDThreeSameVectorBF16DotI { + + def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", + ".2h", V64, v2f32, v8i8>; + def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", + ".2h", V128, v4f32, v16i8>; +} + +class SIMDBF16MLAL + : BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); +} + +class SIMDBF16MLALIndex + : I<(outs V128:$dst), + (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, + "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", + [(set (v4f32 V128:$dst), + (v4f32 (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 (bitconvert (v8bf16 + (AArch64duplane16 (v8bf16 V128_lo:$Rm), + VectorIndexH:$idx)))))))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<4> Rm; + bits<3> idx; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-22} = 0b00111111; + let Inst{21-20} = idx{1-0}; + let Inst{19-16} = Rm; + let Inst{15-12} = 0b1111; + let Inst{11} = idx{2}; // H + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDThreeSameVectorBF16MatrixMul + : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101, + V128, asm, ".4s", + [(set (v4f32 V128:$dst), + (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", + ", $Rm", ".8h", "}"); +} + +class SIMD_BFCVTN + : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, + "bfcvtn", ".4h", ".4s", + [(set (v8bf16 V128:$Rd), + (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; + +class SIMD_BFCVTN2 + : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, + "bfcvtn2", ".8h", ".4s", + [(set (v8bf16 V128:$dst), + (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; + +class BF16ToSinglePrecision + : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", + [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-10} = 0b0001111001100011010000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} +} // End of let mayStore = 0, 
mayLoad = 0, hasSideEffects = 0 + +//---------------------------------------------------------------------------- +// Armv8.6 Matrix Multiply Extension +//---------------------------------------------------------------------------- + +class SIMDThreeSameVectorMatMul + : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; +} + +//---------------------------------------------------------------------------- // ARMv8.2-A Dot Product Instructions (Indexed) -class BaseSIMDThreeSameVectorDotIndex size, string asm, + string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } @@ -7813,6 +8109,34 @@ multiclass SIMDFPIndexed opc, string asm, } multiclass SIMDFPIndexedTiedPatterns { + let Predicates = [HasNEON, HasFullFP16] in { + // Patterns for f16: DUPLANE, DUP scalar and vector_extract. + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast(INST # "v8i16_indexed") + V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast(INST # "v8i16_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast(INST # "v4i16_indexed") + V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast(INST # "v4i16_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn), + (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))), + (!cast(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn, + V128_lo:$Rm, VectorIndexH:$idx)>; + } // Predicates = [HasNEON, HasFullFP16] + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. 
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -7847,15 +8171,11 @@ multiclass SIMDFPIndexedTiedPatterns { (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - // 2 variants for 32-bit scalar version: extract from .2s or from .4s + // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), @@ -7940,6 +8260,64 @@ multiclass SIMDFPIndexedTied opc, string asm> { } } +multiclass SIMDIndexedHSPatterns { + + def : Pat<(v4i16 (OpNodeLane + (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v4i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i16 (OpNodeLaneQ + (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast(NAME # v4i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLane + (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v8i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLaneQ + (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast(NAME # v8i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLane + (v2i32 V64:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast(NAME # v2i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLaneQ + (v2i32 V64:$Rn), (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v2i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLane + (v4i32 V128:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast(NAME # v4i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLaneQ + (v4i32 V128:$Rn), + (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v4i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + +} + multiclass SIMDIndexedHS opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, @@ -10154,15 +10532,15 @@ class ComplexRotationOperand let DiagnosticType = "InvalidComplexRotation" # Type; let Name = "ComplexRotation" # Type; } -def complexrotateop : Operand, ImmLeaf= 0 && Imm <= 270; }], - SDNodeXForm, TImmLeaf= 0 && Imm <= 270; }], + SDNodeXFormgetTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; let PrintMethod = "printComplexRotationOp<90, 0>"; } -def complexrotateopodd : Operand, ImmLeaf= 0 && Imm <= 270; }], - SDNodeXForm, TImmLeaf= 0 && Imm <= 270; }], + SDNodeXFormgetTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td new file mode 100644 index 
0000000000000..a0e7c782f68c3 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -0,0 +1,124 @@ +//=----- AArch64InstrGISel.td - AArch64 GISel target pseudos -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AArch64 GlobalISel target pseudo instruction definitions. This is kept +// separately from the other tablegen files for organizational purposes, but +// share the same infrastructure. +// +//===----------------------------------------------------------------------===// + + +class AArch64GenericInstruction : GenericInstruction { + let Namespace = "AArch64"; +} + +// A pseudo to represent a relocatable add instruction as part of address +// computation. +def G_ADD_LOW : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src, type2:$imm); + let hasSideEffects = 0; +} + +// Pseudo for a rev16 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV16 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev32 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV32 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev64 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV64 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Represents an uzp1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an uzp2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_ZIP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_ZIP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a dup instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_DUP: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$lane); + let hasSideEffects = 0; +} +// Represents a trn1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. 
+def G_TRN1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a trn2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_TRN2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an ext instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_EXT: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm); +} + +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 54f3f7c101324..5139ae5ccaf19 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -24,9 +24,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -111,6 +111,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // This gets lowered to an instruction sequence which takes 16 bytes NumBytes = 16; break; + case AArch64::SpeculationBarrierISBDSBEndBB: + // This gets lowered to 2 4-byte instructions. + NumBytes = 8; + break; + case AArch64::SpeculationBarrierSBEndBB: + // This gets lowered to 1 4-byte instructions. + NumBytes = 4; + break; case AArch64::JumpTableDest32: case AArch64::JumpTableDest16: case AArch64::JumpTableDest8: @@ -119,11 +127,25 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case AArch64::SPACE: NumBytes = MI.getOperand(1).getImm(); break; + case TargetOpcode::BUNDLE: + NumBytes = getInstBundleLength(MI); + break; } return NumBytes; } +unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { + unsigned Size = 0; + MachineBasicBlock::const_instr_iterator I = MI.getIterator(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + assert(!I->isBundle() && "No nested bundle!"); + Size += getInstSizeInBytes(*I); + } + return Size; +} + static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl &Cond) { // Block ends with fall-through condbranch. 
@@ -216,6 +238,12 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (I == MBB.end()) return false; + // Skip over SpeculationBarrierEndBB terminators + if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || + I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { + --I; + } + if (!isUnpredicatedTerminator(*I)) return false; @@ -496,8 +524,9 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, int &TrueCycles, + Register DstReg, Register TrueReg, + Register FalseReg, int &CondCycles, + int &TrueCycles, int &FalseCycles) const { // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -506,6 +535,12 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, if (!RC) return false; + // Also need to check the dest regclass, in case we're trying to optimize + // something like: + // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 + if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) + return false; + // Expanding cbz/tbz requires an extra cycle of latency on the condition. unsigned ExtraCondLat = Cond.size() != 1; @@ -538,9 +573,9 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, + const DebugLoc &DL, Register DstReg, ArrayRef Cond, - unsigned TrueReg, unsigned FalseReg) const { + Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Parse the condition code, see parseCondBranch() above. @@ -910,7 +945,7 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, + Register &SrcReg, Register &DstReg, unsigned &SubIdx) const { switch (MI.getOpcode()) { default: @@ -935,6 +970,7 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned WidthA = 0, WidthB = 0; + bool OffsetAIsScalable = false, OffsetBIsScalable = false; assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); @@ -948,9 +984,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( // base are identical, and the offset of a lower memory access + // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. - if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && - getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { - if (BaseOpA->isIdenticalTo(*BaseOpB)) { + // If OffsetAIsScalable and OffsetBIsScalable are both true, they + // are assumed to have the same scale (vscale). + if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, + WidthA, TRI) && + getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, + WidthB, TRI)) { + if (BaseOpA->isIdenticalTo(*BaseOpB) && + OffsetAIsScalable == OffsetBIsScalable) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; @@ -984,8 +1025,8 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const { // The first operand can be a frame index where we'd normally expect a // register. @@ -1156,10 +1197,9 @@ static bool areCFlagsAccessedBetweenInstrs( return MI.getIterator() == From; }) != To->getParent()->rend()); - // We iterate backward starting \p To until we hit \p From. - for (--To; To != From; --To) { - const MachineInstr &Instr = *To; - + // We iterate backward starting at \p To until we hit \p From. + for (const MachineInstr &Instr : + instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { if (((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) @@ -1180,7 +1220,7 @@ static bool areCFlagsAccessedBetweenInstrs( /// instruction. /// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { assert(CmpInstr.getParent()); assert(MRI); @@ -1416,10 +1456,9 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, return false; UsedNZCV NZCVUsedAfterCmp; - for (auto I = std::next(CmpInstr->getIterator()), - E = CmpInstr->getParent()->instr_end(); - I != E; ++I) { - const MachineInstr &Instr = *I; + for (const MachineInstr &Instr : + instructionsWithoutDebug(std::next(CmpInstr->getIterator()), + CmpInstr->getParent()->instr_end())) { if (Instr.readsRegister(AArch64::NZCV, TRI)) { AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); if (CC == AArch64CC::Invalid) // Unsupported conditional instruction @@ -1684,6 +1723,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { FrameIndex = MI.getOperand(1).getIndex(); @@ -1796,9 +1837,37 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STNPSi: case AArch64::LDG: case AArch64::STGPi: + case AArch64::LD1B_IMM: + case AArch64::LD1H_IMM: + case AArch64::LD1W_IMM: + case AArch64::LD1D_IMM: + case AArch64::ST1B_IMM: + case AArch64::ST1H_IMM: + case AArch64::ST1W_IMM: + case AArch64::ST1D_IMM: + case AArch64::LD1B_H_IMM: + case AArch64::LD1SB_H_IMM: + case AArch64::LD1H_S_IMM: + case AArch64::LD1SH_S_IMM: + case AArch64::LD1W_D_IMM: + case AArch64::LD1SW_D_IMM: + case AArch64::ST1B_H_IMM: + case AArch64::ST1H_S_IMM: + case AArch64::ST1W_D_IMM: + case AArch64::LD1B_S_IMM: + case AArch64::LD1SB_S_IMM: + case AArch64::LD1H_D_IMM: + case AArch64::LD1SH_D_IMM: + case AArch64::ST1B_S_IMM: + case AArch64::ST1H_D_IMM: + case AArch64::LD1B_D_IMM: + case AArch64::LD1SB_D_IMM: + case AArch64::ST1B_D_IMM: return 3; case AArch64::ADDG: case 
AArch64::STGOffset: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: return 2; } } @@ -1978,20 +2047,25 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { return true; } -bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { if (!LdSt.mayLoadOrStore()) return false; - unsigned Width; - return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); + const MachineOperand *BaseOp; + if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, + Width, TRI)) + return false; + BaseOps.push_back(BaseOp); + return true; } bool AArch64InstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, - unsigned &Width, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. if (LdSt.getNumExplicitOperands() == 3) { @@ -2010,7 +2084,7 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth( // Get the scaling factor for the instruction and set the width for the // instruction. - unsigned Scale = 0; + TypeSize Scale(0U, false); int64_t Dummy1, Dummy2; // If this returns false, then it's an instruction we don't want to handle. @@ -2022,12 +2096,13 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth( // set to 1. if (LdSt.getNumExplicitOperands() == 3) { BaseOp = &LdSt.getOperand(1); - Offset = LdSt.getOperand(2).getImm() * Scale; + Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); } else { assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); BaseOp = &LdSt.getOperand(2); - Offset = LdSt.getOperand(3).getImm() * Scale; + Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); } + OffsetIsScalable = Scale.isScalable(); if (!BaseOp->isReg() && !BaseOp->isFI()) return false; @@ -2043,26 +2118,28 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { return OfsOp; } -bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, +bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) { + const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; switch (Opcode) { // Not a memory operation or something we want to handle. 
default: - Scale = Width = 0; + Scale = TypeSize::Fixed(0); + Width = 0; MinOffset = MaxOffset = 0; return false; case AArch64::STRWpost: case AArch64::LDRWpost: Width = 32; - Scale = 4; + Scale = TypeSize::Fixed(4); MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURQi: case AArch64::STURQi: Width = 16; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2072,7 +2149,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURXi: case AArch64::STURDi: Width = 8; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2082,7 +2159,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURWi: case AArch64::STURSi: Width = 4; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2093,7 +2170,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURHi: case AArch64::STURHHi: Width = 2; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2104,7 +2181,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STURBi: case AArch64::STURBBi: Width = 1; - Scale = 1; + Scale = TypeSize::Fixed(1); MinOffset = -256; MaxOffset = 255; break; @@ -2112,14 +2189,15 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDNPQi: case AArch64::STPQi: case AArch64::STNPQi: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 32; MinOffset = -64; MaxOffset = 63; break; case AArch64::LDRQui: case AArch64::STRQui: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = 0; MaxOffset = 4095; break; @@ -2131,7 +2209,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STPDi: case AArch64::STNPXi: case AArch64::STNPDi: - Scale = 8; + Scale = TypeSize::Fixed(8); Width = 16; MinOffset = -64; MaxOffset = 63; @@ -2141,7 +2219,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: - Scale = Width = 8; + Scale = TypeSize::Fixed(8); + Width = 8; MinOffset = 0; MaxOffset = 4095; break; @@ -2153,7 +2232,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::STPSi: case AArch64::STNPWi: case AArch64::STNPSi: - Scale = 4; + Scale = TypeSize::Fixed(4); Width = 8; MinOffset = -64; MaxOffset = 63; @@ -2163,7 +2242,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: - Scale = Width = 4; + Scale = TypeSize::Fixed(4); + Width = 4; MinOffset = 0; MaxOffset = 4095; break; @@ -2173,7 +2253,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSHXui: case AArch64::STRHui: case AArch64::STRHHui: - Scale = Width = 2; + Scale = TypeSize::Fixed(2); + Width = 2; MinOffset = 0; MaxOffset = 4095; break; @@ -2183,18 +2264,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDRSBXui: case AArch64::STRBui: case AArch64::STRBBui: - Scale = Width = 1; + Scale = TypeSize::Fixed(1); + Width = 1; MinOffset = 0; MaxOffset = 4095; break; case AArch64::ADDG: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 0; MinOffset = 0; MaxOffset = 63; break; case AArch64::TAGPstack: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 0; // TAGP with a negative offset turns into SUBP, which has a maximum offset 
// of 63 (not 64!). @@ -2204,31 +2286,110 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, case AArch64::LDG: case AArch64::STGOffset: case AArch64::STZGOffset: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = -256; MaxOffset = 255; break; + case AArch64::STR_ZZZZXI: + case AArch64::LDR_ZZZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 4; + MinOffset = -256; + MaxOffset = 252; + break; + case AArch64::STR_ZZZXI: + case AArch64::LDR_ZZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 3; + MinOffset = -256; + MaxOffset = 253; + break; + case AArch64::STR_ZZXI: + case AArch64::LDR_ZZXI: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector * 2; + MinOffset = -256; + MaxOffset = 254; + break; case AArch64::LDR_PXI: case AArch64::STR_PXI: - Scale = Width = 2; + Scale = TypeSize::Scalable(2); + Width = SVEMaxBytesPerVector / 8; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDR_ZXI: case AArch64::STR_ZXI: - Scale = Width = 16; + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; MinOffset = -256; MaxOffset = 255; break; + case AArch64::LD1B_IMM: + case AArch64::LD1H_IMM: + case AArch64::LD1W_IMM: + case AArch64::LD1D_IMM: + case AArch64::ST1B_IMM: + case AArch64::ST1H_IMM: + case AArch64::ST1W_IMM: + case AArch64::ST1D_IMM: + // A full vectors worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_H_IMM: + case AArch64::LD1SB_H_IMM: + case AArch64::LD1H_S_IMM: + case AArch64::LD1SH_S_IMM: + case AArch64::LD1W_D_IMM: + case AArch64::LD1SW_D_IMM: + case AArch64::ST1B_H_IMM: + case AArch64::ST1H_S_IMM: + case AArch64::ST1W_D_IMM: + // A half vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(8); + Width = SVEMaxBytesPerVector / 2; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_S_IMM: + case AArch64::LD1SB_S_IMM: + case AArch64::LD1H_D_IMM: + case AArch64::LD1SH_D_IMM: + case AArch64::ST1B_S_IMM: + case AArch64::ST1H_D_IMM: + // A quarter vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(4); + Width = SVEMaxBytesPerVector / 4; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD1B_D_IMM: + case AArch64::LD1SB_D_IMM: + case AArch64::ST1B_D_IMM: + // A eighth vector worth of data + // Width = mbytes * elements + Scale = TypeSize::Scalable(2); + Width = SVEMaxBytesPerVector / 8; + MinOffset = -8; + MaxOffset = 7; + break; case AArch64::ST2GOffset: case AArch64::STZ2GOffset: - Scale = 16; + Scale = TypeSize::Fixed(16); Width = 32; MinOffset = -256; MaxOffset = 255; break; case AArch64::STGPi: - Scale = Width = 16; + Scale = TypeSize::Fixed(16); + Width = 16; MinOffset = -64; MaxOffset = 63; break; @@ -2363,9 +2524,13 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOperandWithOffset returns true. 
-bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps( + ArrayRef BaseOps1, + ArrayRef BaseOps2, unsigned NumLoads, + unsigned NumBytes) const { + assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); + const MachineOperand &BaseOp1 = *BaseOps1.front(); + const MachineOperand &BaseOp2 = *BaseOps2.front(); const MachineInstr &FirstLdSt = *BaseOp1.getParent(); const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (BaseOp1.getType() != BaseOp2.getType()) @@ -2379,7 +2544,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, return false; // Only cluster up to a single pair. - if (NumLoads > 1) + if (NumLoads > 2) return false; if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) @@ -2822,11 +2987,11 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, - unsigned SrcReg, bool IsKill, + Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { - unsigned SrcReg0 = SrcReg; - unsigned SrcReg1 = SrcReg; + Register SrcReg0 = SrcReg; + Register SrcReg1 = SrcReg; if (Register::isPhysicalRegister(SrcReg)) { SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); SubIdx0 = 0; @@ -2842,18 +3007,19 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, } void AArch64InstrInfo::storeRegToStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -2862,6 +3028,11 @@ void AArch64InstrInfo::storeRegToStackSlot( case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::STRHui; + else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_PXI; + StackID = TargetStackID::SVEVector; + } break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { @@ -2901,6 +3072,10 @@ void AArch64InstrInfo::storeRegToStackSlot( get(AArch64::STPXi), SrcReg, isKill, AArch64::sube64, AArch64::subo64, FI, MMO); return; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZXI; + StackID = TargetStackID::SVEVector; } break; case 24: @@ -2919,6 +3094,10 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZZXI; + StackID = TargetStackID::SVEVector; } 
break; case 48: @@ -2926,6 +3105,10 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev2d; Offset = false; + } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZZZXI; + StackID = TargetStackID::SVEVector; } break; case 64: @@ -2933,19 +3116,13 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZZZZXI; + StackID = TargetStackID::SVEVector; } break; } - unsigned StackID = TargetStackID::Default; - if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); - Opc = AArch64::STR_PXI; - StackID = TargetStackID::SVEVector; - } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); - Opc = AArch64::STR_ZXI; - StackID = TargetStackID::SVEVector; - } assert(Opc && "Unknown register class"); MFI.setStackID(FI, StackID); @@ -2962,11 +3139,11 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, - unsigned DestReg, unsigned SubIdx0, + Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { - unsigned DestReg0 = DestReg; - unsigned DestReg1 = DestReg; + Register DestReg0 = DestReg; + Register DestReg1 = DestReg; bool IsUndef = true; if (Register::isPhysicalRegister(DestReg)) { DestReg0 = TRI.getSubReg(DestReg, SubIdx0); @@ -2984,18 +3161,19 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, } void AArch64InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -3004,6 +3182,11 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRHui; + else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_PXI; + StackID = TargetStackID::SVEVector; + } break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { @@ -3043,6 +3226,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( get(AArch64::LDPXi), DestReg, AArch64::sube64, AArch64::subo64, FI, MMO); return; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = 
AArch64::LDR_ZXI; + StackID = TargetStackID::SVEVector; } break; case 24: @@ -3061,6 +3248,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZXI; + StackID = TargetStackID::SVEVector; } break; case 48: @@ -3068,6 +3259,10 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev2d; Offset = false; + } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZZXI; + StackID = TargetStackID::SVEVector; } break; case 64: @@ -3075,20 +3270,14 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZZZZXI; + StackID = TargetStackID::SVEVector; } break; } - unsigned StackID = TargetStackID::Default; - if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); - Opc = AArch64::LDR_PXI; - StackID = TargetStackID::SVEVector; - } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); - Opc = AArch64::LDR_ZXI; - StackID = TargetStackID::SVEVector; - } assert(Opc && "Unknown register class"); MFI.setStackID(FI, StackID); @@ -3100,6 +3289,17 @@ void AArch64InstrInfo::loadRegFromStackSlot( MI.addMemOperand(MMO); } +bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, + const MachineInstr &UseMI, + const TargetRegisterInfo *TRI) { + return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), + UseMI.getIterator()), + [TRI](const MachineInstr &I) { + return I.modifiesRegister(AArch64::NZCV, TRI) || + I.readsRegister(AArch64::NZCV, TRI); + }); +} + // Helper function to emit a frame offset adjustment from a given // pointer (SrcReg), stored into DestReg. This function is explicit // in that it requires the opcode. 
@@ -3146,6 +3346,10 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + Register TmpReg = DestReg; + if (TmpReg == AArch64::XZR) + TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); do { uint64_t ThisVal = std::min(Offset, MaxEncodableValue); unsigned LocalShiftSize = 0; @@ -3155,7 +3359,11 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, } assert((ThisVal >> ShiftSize) <= MaxEncoding && "Encoding cannot handle value that big"); - auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + + Offset -= ThisVal << LocalShiftSize; + if (Offset == 0) + TmpReg = DestReg; + auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) .addReg(SrcReg) .addImm(Sign * (int)ThisVal); if (ShiftSize) @@ -3176,8 +3384,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) .addImm(Imm) .setMIFlag(Flag); - assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " - "emit a single SEH directive"); + assert(Offset == 0 && "Expected remaining offset to be zero to " + "emit a single SEH directive"); } else if (DestReg == AArch64::SP) { if (HasWinCFI) *HasWinCFI = true; @@ -3190,8 +3398,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, *HasWinCFI = true; } - SrcReg = DestReg; - Offset -= ThisVal << LocalShiftSize; + SrcReg = TmpReg; } while (Offset); } @@ -3414,18 +3621,6 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( return nullptr; } -static bool isSVEScaledImmInstruction(unsigned Opcode) { - switch (Opcode) { - case AArch64::LDR_ZXI: - case AArch64::STR_ZXI: - case AArch64::LDR_PXI: - case AArch64::STR_PXI: - return true; - default: - return false; - } -} - int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &SOffset, bool *OutUseUnscaledOp, @@ -3458,20 +3653,23 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: + case AArch64::STGloop: + case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } // Get the min/max offset and the scale. - unsigned Scale, Width; + TypeSize ScaleValue(0U, false); + unsigned Width; int64_t MinOff, MaxOff; - if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, + if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); // Construct the complete offset. - bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); - int64_t Offset = - IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); + bool IsMulVL = ScaleValue.isScalable(); + unsigned Scale = ScaleValue.getKnownMinSize(); + int64_t Offset = IsMulVL ? 
SOffset.getScalableBytes() : SOffset.getBytes(); const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); @@ -3484,9 +3682,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); if (useUnscaledOp && - !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) + !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, + MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); + Scale = ScaleValue.getKnownMinSize(); + assert(IsMulVL == ScaleValue.isScalable() && + "Unscaled opcode has different value for scalable"); + int64_t Remainder = Offset % Scale; assert(!(Remainder && useUnscaledOp) && "Cannot have remainder when using unscaled op"); @@ -5791,6 +5994,35 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); }); + // We check to see if CFI Instructions are present, and if they are + // we find the number of CFI Instructions in the candidates. + unsigned CFICount = 0; + MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); + for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); + Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { + const std::vector &CFIInstructions = + RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); + if (MBBI->isCFIInstruction()) { + unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); + MCCFIInstruction CFI = CFIInstructions[CFIIndex]; + CFICount++; + } + MBBI++; + } + + // We compare the number of found CFI Instructions to the number of CFI + // instructions in the parent function for each candidate. We must check this + // since if we outline one of the CFI instructions in a function, we have to + // outline them all for correctness. If we do not, the address offsets will be + // incorrect between the two sections of the program. + for (outliner::Candidate &C : RepeatedSequenceLocs) { + std::vector CFIInstructions = + C.getMF()->getFrameInstructions(); + + if (CFICount > 0 && CFICount != CFIInstructions.size()) + return outliner::OutlinedFunction(); + } + // Returns true if an instructions is safe to fix up, false otherwise. auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { if (MI.isCall()) @@ -5811,23 +6043,29 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( if (MI.mayLoadOrStore()) { const MachineOperand *Base; // Filled with the base operand of MI. int64_t Offset; // Filled with the offset of MI. + bool OffsetIsScalable; // Does it allow us to offset the base operand and is the base the // register SP? - if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() || - Base->getReg() != AArch64::SP) + if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || + !Base->isReg() || Base->getReg() != AArch64::SP) + return false; + + // Fixe-up code below assumes bytes. + if (OffsetIsScalable) return false; // Find the minimum/maximum offset for this instruction and check // if fixing it up would be in range. int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction. - unsigned Scale; // The scale to multiply the offsets by. + TypeSize Scale(0U, false); // The scale to multiply the offsets by. 
unsigned DummyWidth; getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); Offset += 16; // Update the offset to what it would be if we outlined. - if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) + if (Offset < MinOffset * (int64_t)Scale.getFixedSize() || + Offset > MaxOffset * (int64_t)Scale.getFixedSize()) return false; // It's in range, so we can outline it. @@ -5854,7 +6092,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( } else if (LastInstrOpcode == AArch64::BL || - (LastInstrOpcode == AArch64::BLR && !HasBTI)) { + ((LastInstrOpcode == AArch64::BLR || + LastInstrOpcode == AArch64::BLRNoIP) && + !HasBTI)) { // FIXME: Do we need to check if the code after this uses the value of LR? FrameID = MachineOutlinerThunk; NumBytesToCreateFrame = 0; @@ -5960,6 +6200,11 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( } } + // If we have CFI instructions, we can only outline if the outlined section + // can be a tail call + if (FrameID != MachineOutlinerTailCall && CFICount > 0) + return outliner::OutlinedFunction(); + return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); } @@ -5986,6 +6231,10 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( if (!AFI || AFI->hasRedZone().getValueOr(true)) return false; + // FIXME: Teach the outliner to generate/handle Windows unwind info. + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + // It's safe to outline from MF. return true; } @@ -6081,6 +6330,15 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; + // We can only outline these if we will tail call the outlined function, or + // fix up the CFI offsets. Currently, CFI instructions are outlined only if + // in a tail call. + // + // FIXME: If the proper fixups for the offset are implemented, this should be + // possible. + if (MI.isCFIInstruction()) + return outliner::InstrType::Legal; + // Don't allow debug values to impact outlining type. if (MI.isDebugInstr() || MI.isIndirectDebugValue()) return outliner::InstrType::Invisible; @@ -6150,10 +6408,11 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // If we don't know anything about the callee, assume it depends on the // stack layout of the caller. In that case, it's only legal to outline - // as a tail-call. Whitelist the call instructions we know about so we + // as a tail-call. Explicitly list the call instructions we know about so we // don't get unexpected results with call pseudo-instructions. auto UnknownCallOutlineType = outliner::InstrType::Illegal; - if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL) + if (MI.getOpcode() == AArch64::BLR || + MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) UnknownCallOutlineType = outliner::InstrType::LegalTerminator; if (!Callee) @@ -6205,26 +6464,29 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { const MachineOperand *Base; unsigned Width; int64_t Offset; + bool OffsetIsScalable; // Is this a load or store with an immediate offset with SP as the base? if (!MI.mayLoadOrStore() || - !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) || + !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, + &RI) || (Base->isReg() && Base->getReg() != AArch64::SP)) continue; // It is, so we have to fix it up. 
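Editorial sketch, a hedged aside on the arithmetic above and the matching rewrite in fixupPostOutline below: outlining pushes the return address, so an SP-relative scaled immediate must still encode once 16 bytes are added to the byte offset, and the re-encoded immediate is (Offset + 16) / Scale. The helper name and the example numbers below are invented; only the TypeSize accessors the patch itself uses (isScalable, getFixedSize) are assumed.

#include "llvm/Support/TypeSize.h"
#include <cassert>
#include <cstdint>

// Sketch only, not LLVM code: recompute an SP-relative scaled immediate after
// outlining has pushed the link register (16 bytes) onto the stack.
static int64_t rewriteOutlinedImm(int64_t ByteOffset, llvm::TypeSize Scale) {
  assert(!Scale.isScalable() && "fix-up assumes a fixed byte scale");
  int64_t Adjusted = ByteOffset + 16;                 // LR now sits below the old SP
  int64_t ScaleBytes = (int64_t)Scale.getFixedSize();
  assert(Adjusted % ScaleBytes == 0 && "offset must stay scale-aligned");
  return Adjusted / ScaleBytes;                       // new encoded immediate
}
// Example: a 64-bit store (scale 8) at byte offset 32 would be re-encoded
// with immediate (32 + 16) / 8 == 6, provided it stays within MinOffset/MaxOffset.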
- unsigned Scale; + TypeSize Scale(0U, false); int64_t Dummy1, Dummy2; MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); assert(Scale != 0 && "Unexpected opcode!"); + assert(!OffsetIsScalable && "Expected offset to be a byte offset"); // We've pushed the return address to the stack, so add 16 to the offset. // This is safe, since we already checked if it would overflow when we // checked if this instruction was legal to outline. - int64_t NewImm = (Offset + 16) / Scale; + int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize(); StackOffsetOperand.setImm(NewImm); } } @@ -6285,15 +6547,21 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { - // For thunk outlining, rewrite the last instruction from a call to a - // tail-call. - if (OF.FrameConstructionID == MachineOutlinerThunk) { + + AArch64FunctionInfo *FI = MF.getInfo(); + + if (OF.FrameConstructionID == MachineOutlinerTailCall) + FI->setOutliningStyle("Tail Call"); + else if (OF.FrameConstructionID == MachineOutlinerThunk) { + // For thunk outlining, rewrite the last instruction from a call to a + // tail-call. MachineInstr *Call = &*--MBB.instr_end(); unsigned TailOpcode; if (Call->getOpcode() == AArch64::BL) { TailOpcode = AArch64::TCRETURNdi; } else { - assert(Call->getOpcode() == AArch64::BLR); + assert(Call->getOpcode() == AArch64::BLR || + Call->getOpcode() == AArch64::BLRNoIP); TailOpcode = AArch64::TCRETURNriALL; } MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) @@ -6301,6 +6569,8 @@ void AArch64InstrInfo::buildOutlinedFrame( .addImm(0); MBB.insert(MBB.end(), TC); Call->eraseFromParent(); + + FI->setOutliningStyle("Thunk"); } bool IsLeafFunction = true; @@ -6320,7 +6590,8 @@ void AArch64InstrInfo::buildOutlinedFrame( IsLeafFunction = false; // LR has to be a live in so that we can save it. - MBB.addLiveIn(AArch64::LR); + if (!MBB.isLiveIn(AArch64::LR)) + MBB.addLiveIn(AArch64::LR); MachineBasicBlock::iterator It = MBB.begin(); MachineBasicBlock::iterator Et = MBB.end(); @@ -6343,7 +6614,7 @@ void AArch64InstrInfo::buildOutlinedFrame( // Add a CFI saying the stack was moved 16 B down. int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16)); + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(StackPosEntry) .setMIFlags(MachineInstr::FrameSetup); @@ -6351,7 +6622,7 @@ void AArch64InstrInfo::buildOutlinedFrame( // Add a CFI saying that the LR that we want to find is now 16 B higher than // before. int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16)); + MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(LRPosEntry) .setMIFlags(MachineInstr::FrameSetup); @@ -6399,13 +6670,20 @@ void AArch64InstrInfo::buildOutlinedFrame( } // It's not a tail call, so we have to insert the return ourselves. + + // LR has to be a live in so that we can return to it. 
+ if (!MBB.isLiveIn(AArch64::LR)) + MBB.addLiveIn(AArch64::LR); + MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) - .addReg(AArch64::LR, RegState::Undef); + .addReg(AArch64::LR); MBB.insert(MBB.end(), ret); signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, ShouldSignReturnAddrWithAKey); + FI->setOutliningStyle("Function"); + // Did we have to modify the stack by saving the link register? if (OF.FrameConstructionID != MachineOutlinerDefault) return; @@ -6519,7 +6797,8 @@ Optional AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, // TODO: Handle cases where Reg is a super- or sub-register of the // destination register. - if (Reg != MI.getOperand(0).getReg()) + const MachineOperand &Op0 = MI.getOperand(0); + if (!Op0.isReg() || Reg != Op0.getReg()) return None; switch (MI.getOpcode()) { @@ -6614,5 +6893,17 @@ AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, return TargetInstrInfo::describeLoadedValue(MI, Reg); } +uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { + return get(Opc).TSFlags & AArch64::ElementSizeMask; +} + +unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { + if (MF.getSubtarget().hardenSlsBlr()) + return AArch64::BLRNoIP; + else + return AArch64::BLR; +} + #define GET_INSTRINFO_HELPERS +#define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 66e517e549035..298c04d81708d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/TypeSize.h" #define GET_INSTRINFO_HEADER #include "AArch64GenInstrInfo.inc" @@ -51,8 +52,8 @@ public: bool isAsCheapAsAMove(const MachineInstr &MI) const override; - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, + Register &DstReg, unsigned &SubIdx) const override; bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, @@ -112,14 +113,19 @@ public: /// Hint that pairing the given load or store is unprofitable. static void suppressLdStPair(MachineInstr &MI); - bool getMemOperandWithOffset(const MachineInstr &MI, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( + const MachineInstr &MI, SmallVectorImpl &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const override; + /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`. + /// This is true for some SVE instructions like ldr/str that have a + /// 'reg + imm' addressing mode where the immediate is an index to the + /// scalable vector located at 'reg + imm * vscale x #bytes'. bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, - int64_t &Offset, unsigned &Width, + int64_t &Offset, bool &OffsetIsScalable, + unsigned &Width, const TargetRegisterInfo *TRI) const; /// Return the immediate offset of the base register in a load/store \p LdSt. @@ -129,12 +135,12 @@ public: /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly. /// /// For unscaled instructions, \p Scale is set to 1. 
- static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, + static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset); - bool shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const override; + bool shouldClusterMemOps(ArrayRef BaseOps1, + ArrayRef BaseOps2, + unsigned NumLoads, unsigned NumBytes) const override; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, @@ -149,13 +155,13 @@ public: bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, + MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, + MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -191,11 +197,12 @@ public: bool reverseBranchCondition(SmallVectorImpl &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef Cond, - unsigned, unsigned, int &, int &, int &) const override; + Register, Register, Register, int &, int &, + int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const DebugLoc &DL, unsigned DstReg, - ArrayRef Cond, unsigned TrueReg, - unsigned FalseReg) const override; + const DebugLoc &DL, Register DstReg, + ArrayRef Cond, Register TrueReg, + Register FalseReg) const override; void getNoop(MCInst &NopInst) const override; bool isSchedulingBoundary(const MachineInstr &MI, @@ -205,13 +212,13 @@ public: /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, + bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction supplying the argument to /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, + bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, + Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool optimizeCondBranch(MachineInstr &MI) const override; @@ -264,6 +271,8 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + /// Returns the vector element size (B, H, S or D) of an SVE opcode. + uint64_t getElementSizeForOpcode(unsigned Opc) const; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. 
static bool isFalkorShiftExtFast(const MachineInstr &MI); @@ -288,6 +297,8 @@ protected: isCopyInstrImpl(const MachineInstr &MI) const override; private: + unsigned getInstBundleLength(const MachineInstr &MI) const; + /// Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. /// @@ -305,6 +316,12 @@ private: unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; }; +/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI +/// which either reads or clobbers NZCV. +bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, + const MachineInstr &UseMI, + const TargetRegisterInfo *TRI); + /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg /// plus Offset. This is intended to be used from within the prolog/epilog /// insertion (PEI) pass, where a virtual scratch register may be allocated @@ -369,12 +386,24 @@ static inline bool isCondBranchOpcode(int Opc) { } static inline bool isIndirectBranchOpcode(int Opc) { - return Opc == AArch64::BR; + switch (Opc) { + case AArch64::BR: + case AArch64::BRAA: + case AArch64::BRAB: + case AArch64::BRAAZ: + case AArch64::BRABZ: + return true; + } + return false; } +/// Return opcode to be used for indirect calls. +unsigned getBLRCallOpcode(const MachineFunction &MF); + // struct TSFlags { #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits -#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 1-bit +#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit +#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits // } namespace AArch64 { @@ -389,13 +418,31 @@ enum ElementSizeType { }; enum DestructiveInstType { - DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), - NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0), - Destructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), + DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0xf), + NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0), + DestructiveOther = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), + DestructiveUnary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x2), + DestructiveBinaryImm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x3), + DestructiveBinaryShImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x4), + DestructiveBinary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x5), + DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6), + DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7), + DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8), +}; + +enum FalseLaneType { + FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3), + FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), + FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2), }; #undef TSFLAG_ELEMENT_SIZE_TYPE #undef TSFLAG_DESTRUCTIVE_INST_TYPE +#undef TSFLAG_FALSE_LANE_TYPE + +int getSVEPseudoMap(uint16_t Opcode); +int getSVERevInstr(uint16_t Opcode); +int getSVENonRevInstr(uint16_t Opcode); } } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d590d4d913ff8..f4a5f639e4973 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -14,142 +14,154 @@ // ARM Instruction Predicate Definitions. 
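Editorial sketch on the widened TSFlags layout defined in the header above: it packs a 3-bit element size, a 4-bit destructive-instruction type and a 2-bit false-lane policy. The helper names below are hypothetical; only the AArch64:: mask and enumerator names come from the header, and the decoding mirrors getElementSizeForOpcode, which applies ElementSizeMask to the opcode descriptor's TSFlags.

#include "AArch64InstrInfo.h"   // assumption: in-tree include of the header above
#include <cstdint>

// Sketch only: pull the individual fields back out of a TSFlags word.
static uint64_t elementSizeOf(uint64_t TSFlags) {
  return TSFlags & llvm::AArch64::ElementSizeMask;          // B, H, S or D
}
static uint64_t destructiveTypeOf(uint64_t TSFlags) {
  return TSFlags & llvm::AArch64::DestructiveInstTypeMask;  // e.g. DestructiveBinaryImm
}
static bool zeroesFalseLanes(uint64_t TSFlags) {
  return (TSFlags & llvm::AArch64::FalseLanesMask) == llvm::AArch64::FalseLanesZero;
}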
// def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; + AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; + AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; + AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; + AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; + AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; +def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, + AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; def HasVH : Predicate<"Subtarget->hasVH()">, - AssemblerPredicate<"FeatureVH", "vh">; + AssemblerPredicate<(all_of FeatureVH), "vh">; def HasLOR : Predicate<"Subtarget->hasLOR()">, - AssemblerPredicate<"FeatureLOR", "lor">; + AssemblerPredicate<(all_of FeatureLOR), "lor">; def HasPA : Predicate<"Subtarget->hasPA()">, - AssemblerPredicate<"FeaturePA", "pa">; + AssemblerPredicate<(all_of FeaturePA), "pa">; def HasJS : Predicate<"Subtarget->hasJS()">, - AssemblerPredicate<"FeatureJS", "jsconv">; + AssemblerPredicate<(all_of FeatureJS), "jsconv">; def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, - AssemblerPredicate<"FeatureCCIDX", "ccidx">; + AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">; def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, - AssemblerPredicate<"FeatureComplxNum", "complxnum">; + AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">; def HasNV : Predicate<"Subtarget->hasNV()">, - AssemblerPredicate<"FeatureNV", "nv">; + AssemblerPredicate<(all_of FeatureNV), "nv">; def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">, - AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">; + AssemblerPredicate<(all_of FeatureRASv8_4), "rasv8_4">; def HasMPAM : Predicate<"Subtarget->hasMPAM()">, - AssemblerPredicate<"FeatureMPAM", "mpam">; + AssemblerPredicate<(all_of FeatureMPAM), "mpam">; def HasDIT : Predicate<"Subtarget->hasDIT()">, - AssemblerPredicate<"FeatureDIT", "dit">; + AssemblerPredicate<(all_of FeatureDIT), "dit">; def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, - AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">; + AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">; def HasAM : Predicate<"Subtarget->hasAM()">, - AssemblerPredicate<"FeatureAM", "am">; + AssemblerPredicate<(all_of FeatureAM), "am">; def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, - AssemblerPredicate<"FeatureSEL2", "sel2">; + AssemblerPredicate<(all_of FeatureSEL2), "sel2">; def HasPMU : Predicate<"Subtarget->hasPMU()">, - AssemblerPredicate<"FeaturePMU", "pmu">; + AssemblerPredicate<(all_of FeaturePMU), "pmu">; def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, - AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">; + AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; def HasFMI : Predicate<"Subtarget->hasFMI()">, - AssemblerPredicate<"FeatureFMI", "fmi">; + AssemblerPredicate<(all_of FeatureFMI), "fmi">; def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, - AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">; + AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; + 
AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; + AssemblerPredicate<(all_of FeatureNEON), "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; + AssemblerPredicate<(all_of FeatureCrypto), "crypto">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, - AssemblerPredicate<"FeatureSM4", "sm4">; + AssemblerPredicate<(all_of FeatureSM4), "sm4">; def HasSHA3 : Predicate<"Subtarget->hasSHA3()">, - AssemblerPredicate<"FeatureSHA3", "sha3">; + AssemblerPredicate<(all_of FeatureSHA3), "sha3">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<"FeatureSHA2", "sha2">; + AssemblerPredicate<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<"FeatureAES", "aes">; + AssemblerPredicate<(all_of FeatureAES), "aes">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<"FeatureDotProd", "dotprod">; + AssemblerPredicate<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; + AssemblerPredicate<(all_of FeatureCRC), "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, - AssemblerPredicate<"FeatureLSE", "lse">; + AssemblerPredicate<(all_of FeatureLSE), "lse">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<"FeatureRAS", "ras">; + AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasRDM : Predicate<"Subtarget->hasRDM()">, - AssemblerPredicate<"FeatureRDM", "rdm">; + AssemblerPredicate<(all_of FeatureRDM), "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<"FeatureFullFP16", "fullfp16">; + AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - AssemblerPredicate<"FeatureFP16FML", "fp16fml">; + AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, - AssemblerPredicate<"FeatureSPE", "spe">; + AssemblerPredicate<(all_of FeatureSPE), "spe">; def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, - AssemblerPredicate<"FeatureFuseAES", + AssemblerPredicate<(all_of FeatureFuseAES), "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, - AssemblerPredicate<"FeatureSVE", "sve">; + AssemblerPredicate<(all_of FeatureSVE), "sve">; def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, - AssemblerPredicate<"FeatureSVE2", "sve2">; + AssemblerPredicate<(all_of FeatureSVE2), "sve2">; def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, - AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">; + AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">; def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, - AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">; + AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, - AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">; + AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">; + AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, - AssemblerPredicate<"FeatureRCPC", "rcpc">; + AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, - 
AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">; + AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">; def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, - AssemblerPredicate<"FeatureFRInt3264", "frint3264">; + AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">; def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<"FeatureSB", "sb">; + AssemblerPredicate<(all_of FeatureSB), "sb">; def HasPredRes : Predicate<"Subtarget->hasPredRes()">, - AssemblerPredicate<"FeaturePredRes", "predres">; + AssemblerPredicate<(all_of FeaturePredRes), "predres">; def HasCCDP : Predicate<"Subtarget->hasCCDP()">, - AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">; + AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">; def HasBTI : Predicate<"Subtarget->hasBTI()">, - AssemblerPredicate<"FeatureBranchTargetId", "bti">; + AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, - AssemblerPredicate<"FeatureMTE", "mte">; + AssemblerPredicate<(all_of FeatureMTE), "mte">; def HasTME : Predicate<"Subtarget->hasTME()">, - AssemblerPredicate<"FeatureTME", "tme">; + AssemblerPredicate<(all_of FeatureTME), "tme">; def HasETE : Predicate<"Subtarget->hasETE()">, - AssemblerPredicate<"FeatureETE", "ete">; + AssemblerPredicate<(all_of FeatureETE), "ete">; def HasTRBE : Predicate<"Subtarget->hasTRBE()">, - AssemblerPredicate<"FeatureTRBE", "trbe">; + AssemblerPredicate<(all_of FeatureTRBE), "trbe">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">, + AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; +def UseExperimentalZeroingPseudos + : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; def UseNegativeImmediates - : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), "NegativeImmediates">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", @@ -227,6 +239,10 @@ def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; @@ -245,6 +261,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; // Generates the general dynamic sequences, i.e. 
// adrp x0, :tlsdesc:var @@ -419,7 +436,14 @@ def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; -def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; +def AArch64strict_fcmp : SDNode<"AArch64ISD::STRICT_FCMP", SDT_AArch64FCmp, + [SDNPHasChain]>; +def AArch64strict_fcmpe : SDNode<"AArch64ISD::STRICT_FCMPE", SDT_AArch64FCmp, + [SDNPHasChain]>; +def AArch64any_fcmp : PatFrags<(ops node:$lhs, node:$rhs), + [(AArch64strict_fcmp node:$lhs, node:$rhs), + (AArch64fcmp node:$lhs, node:$rhs)]>; def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; @@ -457,10 +481,12 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; +def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>; +def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -528,6 +554,9 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; +def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; +def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; + def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -544,6 +573,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; @@ -564,6 +594,8 @@ let RecomputePerFunction = 1 in { def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; + def SLSBLRMitigation : Predicate<[{ MF->getSubtarget().hardenSlsBlr() }]>; + def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget().hardenSlsBlr() }]>; // Toggles patterns which aren't beneficial in GlobalISel when we aren't // optimizing. This allows us to selectively use patterns without impacting // SelectionDAG's behaviour. 
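Editorial sketch on the two RecomputePerFunction predicates just above: they let instruction selection switch call lowering per function depending on whether SLS BLR hardening is requested, and they gate the BLR vs. BLRNoIP call patterns added later in this file. A hedged sketch of the per-function query the predicate string boils down to; the wrapper function is invented, only the subtarget hook name comes from the predicate itself.

#include "AArch64Subtarget.h"                 // assumption: in-tree target header
#include "llvm/CodeGen/MachineFunction.h"

// Sketch only: the per-function condition evaluated by SLSBLRMitigation.
static bool wantsSlsBlrHardening(const llvm::MachineFunction &MF) {
  return MF.getSubtarget<llvm::AArch64Subtarget>().hardenSlsBlr();
}

When it holds, the AArch64call patterns further down select the BLRNoIP pseudo instead of plain BLR.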
@@ -686,6 +718,14 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in { : Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>; } +// SpeculationBarrierEndBB must only be used after an unconditional control +// flow, i.e. after a terminator for which isBarrier is True. +let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + def SpeculationBarrierISBDSBEndBB + : Pseudo<(outs), (ins), []>, Sched<[]>; + def SpeculationBarrierSBEndBB + : Pseudo<(outs), (ins), []>, Sched<[]>; +} //===----------------------------------------------------------------------===// // System instructions. @@ -698,8 +738,15 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"dgh", (HINT 0b110)>; def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; def : InstAlias<"csdb", (HINT 20)>; +// In order to be able to write readable assembly, LLVM should accept assembly +// inputs that use Branch Target Indentification mnemonics, even with BTI disabled. +// However, in order to be compatible with other assemblers (e.g. GAS), LLVM +// should not emit these mnemonics unless BTI is enabled. +def : InstAlias<"bti", (HINT 32), 0>; +def : InstAlias<"bti $op", (HINT btihint_op:$op), 0>; def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>; def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>; @@ -731,10 +778,58 @@ def TSB : CRmSystemI { // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>; +} + +// ARMv8.6-A BFloat +let Predicates = [HasBF16] in { +defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">; +defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">; +def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">; +def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFCVTN : SIMD_BFCVTN; +def BFCVTN2 : SIMD_BFCVTN2; +def BFCVT : BF16ToSinglePrecision<"bfcvt">; +} + +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element. 
+class BaseSIMDSUDOTIndex + : BaseSIMDThreeSameVectorDotIndex { + let Pattern = [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (InputType RegType:$Rn))))]; +} + +multiclass SIMDSUDOTIndex { + def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>; + def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>; +} + +defm SUDOTlane : SIMDSUDOTIndex; + } // ARMv8.2-A FP16 Fused Multiply-Add Long @@ -819,38 +914,56 @@ let Predicates = [HasComplxNum, HasNEON] in { // important for compatibility with other assemblers (e.g. GAS) when building // software compatible with both CPUs that do or don't implement PA. let Uses = [LR], Defs = [LR] in { - def PACIAZ : SystemNoOperands<0b000, "hint #24">; - def PACIBZ : SystemNoOperands<0b010, "hint #26">; + def PACIAZ : SystemNoOperands<0b000, "hint\t#24">; + def PACIBZ : SystemNoOperands<0b010, "hint\t#26">; let isAuthenticated = 1 in { - def AUTIAZ : SystemNoOperands<0b100, "hint #28">; - def AUTIBZ : SystemNoOperands<0b110, "hint #30">; + def AUTIAZ : SystemNoOperands<0b100, "hint\t#28">; + def AUTIBZ : SystemNoOperands<0b110, "hint\t#30">; } } let Uses = [LR, SP], Defs = [LR] in { - def PACIASP : SystemNoOperands<0b001, "hint #25">; - def PACIBSP : SystemNoOperands<0b011, "hint #27">; + def PACIASP : SystemNoOperands<0b001, "hint\t#25">; + def PACIBSP : SystemNoOperands<0b011, "hint\t#27">; let isAuthenticated = 1 in { - def AUTIASP : SystemNoOperands<0b101, "hint #29">; - def AUTIBSP : SystemNoOperands<0b111, "hint #31">; + def AUTIASP : SystemNoOperands<0b101, "hint\t#29">; + def AUTIBSP : SystemNoOperands<0b111, "hint\t#31">; } } let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { - def PACIA1716 : SystemNoOperands<0b000, "hint #8">; - def PACIB1716 : SystemNoOperands<0b010, "hint #10">; + def PACIA1716 : SystemNoOperands<0b000, "hint\t#8">; + def PACIB1716 : SystemNoOperands<0b010, "hint\t#10">; let isAuthenticated = 1 in { - def AUTIA1716 : SystemNoOperands<0b100, "hint #12">; - def AUTIB1716 : SystemNoOperands<0b110, "hint #14">; + def AUTIA1716 : SystemNoOperands<0b100, "hint\t#12">; + def AUTIB1716 : SystemNoOperands<0b110, "hint\t#14">; } } let Uses = [LR], Defs = [LR], CRm = 0b0000 in { - def XPACLRI : SystemNoOperands<0b111, "hint #7">; -} + def XPACLRI : SystemNoOperands<0b111, "hint\t#7">; +} + +// In order to be able to write readable assembly, LLVM should accept assembly +// inputs that use pointer authentication mnemonics, even with PA disabled. +// However, in order to be compatible with other assemblers (e.g. GAS), LLVM +// should not emit these mnemonics unless PA is enabled. +def : InstAlias<"paciaz", (PACIAZ), 0>; +def : InstAlias<"pacibz", (PACIBZ), 0>; +def : InstAlias<"autiaz", (AUTIAZ), 0>; +def : InstAlias<"autibz", (AUTIBZ), 0>; +def : InstAlias<"paciasp", (PACIASP), 0>; +def : InstAlias<"pacibsp", (PACIBSP), 0>; +def : InstAlias<"autiasp", (AUTIASP), 0>; +def : InstAlias<"autibsp", (AUTIBSP), 0>; +def : InstAlias<"pacia1716", (PACIA1716), 0>; +def : InstAlias<"pacib1716", (PACIB1716), 0>; +def : InstAlias<"autia1716", (AUTIA1716), 0>; +def : InstAlias<"autib1716", (AUTIB1716), 0>; +def : InstAlias<"xpaclri", (XPACLRI), 0>; // These pointer authentication instructions require armv8.3a let Predicates = [HasPA] in { - // When compiling with PA, there is a better mnemonic for these instructions. 
+ // When PA is enabled, a better mnemonic should be emitted. def : InstAlias<"paciaz", (PACIAZ), 1>; def : InstAlias<"pacibz", (PACIBZ), 1>; def : InstAlias<"autiaz", (AUTIAZ), 1>; @@ -884,15 +997,23 @@ let Predicates = [HasPA] in { def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; // Combined Instructions - def BRAA : AuthBranchTwoOperands<0, 0, "braa">; - def BRAB : AuthBranchTwoOperands<0, 1, "brab">; - def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; - def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAA : AuthBranchTwoOperands<0, 0, "braa">; + def BRAB : AuthBranchTwoOperands<0, 1, "brab">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; + def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + } - def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; - def BRABZ : AuthOneOperand<0b000, 1, "brabz">; - def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; - def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; + def BRABZ : AuthOneOperand<0b000, 1, "brabz">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; + def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + } let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def RETAA : AuthReturn<0b010, 0, "retaa">; @@ -1538,17 +1659,29 @@ def TAGPstack // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. +// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. $Rm is the loop counter. let isCodeGenOnly=1, mayStore=1 in { +def STGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +// A variant of the above where $Rn2 is an independent register not tied to the input register $Rn. +// Their purpose is to use a FrameIndex operand as $Rn (which of course can not be written back). 
def STGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; } @@ -1894,9 +2027,19 @@ def ERET : SpecialReturn<0b0100, "eret">; def : InstAlias<"ret", (RET LR)>; let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; + def BLR : BranchReg<0b0001, "blr", []>; + def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>, + Sched<[WriteBrReg]>, + PseudoInstExpansion<(BLR GPR64:$Rn)>; } // isCall +def : Pat<(AArch64call GPR64:$Rn), + (BLR GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call GPR64noip:$Rn), + (BLRNoIP GPR64noip:$Rn)>, + Requires<[SLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -2129,6 +2272,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : VecROLoadPat; } defm : VecROLoadPat; @@ -2143,6 +2287,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : VecROLoadPat; defm : VecROLoadPat; } } // AddedComplexity = 10 @@ -2225,6 +2370,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr", [(set (f128 FPR128Op:$Rt), (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; +// bf16 load pattern +def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. 
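Editorial sketch tied to the BLR/BLRNoIP split above: code outside ISel that materialises an indirect call is expected to go through getBLRCallOpcode (added to AArch64InstrInfo earlier in this patch) so it honours the same SLS mitigation choice as these patterns. A minimal, hypothetical call-site sketch; the function and its parameters are invented for illustration.

#include "AArch64InstrInfo.h"                  // assumption: in-tree target header
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Sketch only: emit an indirect call that respects the SLS BLR mitigation.
static void emitIndirectCall(llvm::MachineBasicBlock &MBB,
                             llvm::MachineBasicBlock::iterator MBBI,
                             const llvm::DebugLoc &DL,
                             const llvm::AArch64InstrInfo &TII,
                             llvm::Register TargetReg) {
  const unsigned Opc = llvm::getBLRCallOpcode(*MBB.getParent()); // BLR or BLRNoIP
  llvm::BuildMI(MBB, MBBI, DL, TII.get(Opc)).addReg(TargetReg);
}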
@@ -2274,6 +2423,8 @@ let Predicates = [IsLE] in { (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; @@ -2297,6 +2448,8 @@ let Predicates = [IsLE] in { (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; @@ -2381,11 +2534,11 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ if (auto *G = dyn_cast(N)) { const DataLayout &DL = MF->getDataLayout(); - MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL); - return Align && *Align >= 4 && G->getOffset() % 4 == 0; + Align Align = G->getGlobal()->getPointerAlignment(DL); + return Align >= 4 && G->getOffset() % 4 == 0; } if (auto *C = dyn_cast(N)) - return C->getAlignment() >= 4 && C->getOffset() % 4 == 0; + return C->getAlign() >= 4 && C->getOffset() % 4 == 0; return false; }]>; @@ -2425,7 +2578,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur", [(set FPR8Op:$Rt, (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur", - [(set FPR16Op:$Rt, + [(set (f16 FPR16Op:$Rt), (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur", [(set (f32 FPR32Op:$Rt), @@ -2722,6 +2875,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">; def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>; +def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)), + (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>; + + //--- // (Register offset) @@ -2791,6 +2948,7 @@ let Predicates = [IsLE] in { defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } defm : VecROStorePat; @@ -2806,6 +2964,7 @@ let Predicates = [IsLE, UseSTRQro] in { defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } } // AddedComplexity = 10 @@ -2866,6 +3025,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb", (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +// bf16 store pattern +def : Pat<(store (bf16 FPR16Op:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>; + let AddedComplexity = 10 in { // Match all store 64 bits width whose type is compatible with FPR64 @@ -2893,6 +3057,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -2923,6 +3090,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 
FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } // truncstore i64 @@ -3030,6 +3200,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -3062,6 +3235,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } } // AddedComplexity = 10 @@ -3300,10 +3476,10 @@ defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; -defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; multiclass FPToIntegerIntPats { def : Pat<(i32 (round f16:$Rn)), (!cast(INST # UWHr) $Rn)>; @@ -3375,8 +3551,8 @@ def : Pat<(i64 (llround f64:$Rn)), // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// -defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; +defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>; //===----------------------------------------------------------------------===// // Unscaled integer to floating point conversion instruction. @@ -3541,8 +3717,8 @@ def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), // Floating point comparison instructions. //===----------------------------------------------------------------------===// -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; +defm FCMPE : FPComparison<1, "fcmpe", AArch64strict_fcmpe>; +defm FCMP : FPComparison<0, "fcmp", AArch64any_fcmp>; //===----------------------------------------------------------------------===// // Floating point conditional comparison instructions. @@ -3603,10 +3779,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, Sched<[]>; } -let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, - usesCustomInserter = 1 in -def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>; - //===----------------------------------------------------------------------===// // Floating point immediate move. 
//===----------------------------------------------------------------------===// @@ -3788,12 +3960,16 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte> defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; -def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; -def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; -def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; -def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; -def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; // Patterns for vector long shift (by element width). These need to match all // three of zext, sext and anyext so it's easier to pull the patterns out of the @@ -3900,7 +4076,7 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; @@ -3917,7 +4093,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", @@ -3934,33 +4110,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : 
SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +// Pseudo bitwise select pattern BSP. +// It is expanded into BSL/BIT/BIF after register allocation. +defm BSP : SIMDLogicalThreeVectorPseudo>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">; + +def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; @@ -4669,6 +4848,7 @@ multiclass ExtPat { defm : ExtPat; defm : ExtPat; defm : ExtPat; +defm : ExtPat; defm : ExtPat; defm : ExtPat; defm : ExtPat; @@ -4790,16 +4970,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), (v4f16 (DUPv4i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v4bf16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), (v8f16 (DUPv8i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v8bf16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; + def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, 
VectorIndexS:$imm)>; def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), @@ -4915,6 +5108,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; @@ -4931,6 +5129,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -4956,6 +5159,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0))>; +def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -5037,6 +5257,7 @@ multiclass Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; @@ -5050,6 +5271,9 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; def : Pat<(vector_extract (v8f16 V128:$Rn), 0), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), + (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; @@ -5057,6 +5281,8 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), + (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -5072,6 +5298,7 @@ def : ConcatPat; def : ConcatPat; def : ConcatPat; def : ConcatPat; +def : ConcatPat; def : ConcatPat; // If the high lanes are undef, though, we can just ignore them: @@ -5613,6 +5840,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQDMULH : SIMDIndexedHSPatterns; +defm SQRDMULH : SIMDIndexedHSPatterns; + // Generated by MachineCombine defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>; defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; @@ -5780,8 +6012,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; -def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>; +def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", @@ -5794,8 +6026,8 @@ defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; -def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; +def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; @@ -6147,6 +6379,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat @@ -6161,6 +6397,7 @@ def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; +def : Ld1Lane128Pat; class Ld1Lane64Pat @@ -6176,6 +6413,7 @@ def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; +def : Ld1Lane64Pat; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; @@ -6204,6 +6442,7 @@ def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; +def : St1Lane128Pat; let AddedComplexity = 19 in class St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; +def : St1Lane64Pat; multiclass St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; +defm : St1LanePost64Pat; multiclass St1LanePost128Pat; defm : St1LanePost128Pat; defm : St1LanePost128Pat; defm : St1LanePost128Pat; +defm : St1LanePost128Pat; let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; @@ -6508,6 +6750,7 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def 
: Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6515,12 +6758,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6528,6 +6773,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6544,6 +6790,7 @@ def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6552,6 +6799,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6560,6 +6808,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 
FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6568,6 +6817,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6579,6 +6829,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -6587,6 +6838,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { @@ -6594,6 +6846,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -6604,6 +6857,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -6618,6 +6873,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS 
GPR64:$Xn, FPR64))>; @@ -6629,6 +6886,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -6658,6 +6917,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -6669,6 +6929,8 @@ def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -6682,6 +6944,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -6696,6 +6959,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -6722,6 +6987,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; @@ -6730,6 +6996,13 @@ def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), @@ -6744,8 +7017,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), + (v4bf16 
(REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), + (v4bf16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -6755,6 +7042,7 @@ def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -6771,6 +7059,8 @@ def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -6779,6 +7069,7 @@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -6791,6 +7082,8 @@ def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -6801,6 +7094,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -6813,6 +7107,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -6824,6 +7120,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert 
(v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -6838,6 +7135,8 @@ def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -6848,6 +7147,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -6862,6 +7162,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), @@ -6877,6 +7180,7 @@ def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } @@ -6890,6 +7194,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), @@ -6901,6 +7207,7 @@ let Predicates = [IsLE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6913,6 +7220,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), @@ -6929,6 +7238,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert 
(v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -6944,6 +7254,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6954,6 +7266,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -6970,6 +7283,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -6998,6 +7313,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; @@ -7006,6 +7322,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), @@ -7022,8 +7345,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 (REV32v8i16 FPR128:$src))>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), + (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), + (v8bf16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), + (v8bf16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), + (v8bf16 
(REV32v8i16 FPR128:$src))>; } def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -7033,6 +7372,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), @@ -7051,6 +7391,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), @@ -7061,6 +7403,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), @@ -7092,6 +7436,8 @@ multiclass InsertSubvectorUndef { (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; } @@ -7317,3 +7663,5 @@ let AddedComplexity = 10 in { include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" + +include "AArch64InstrGISel.td" diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp deleted file mode 100644 index b9ac2657e1c5a..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ /dev/null @@ -1,4918 +0,0 @@ -//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements the targeting of the InstructionSelector class for -/// AArch64. -/// \todo This should be generated by TableGen. 
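Editorial note: the IsBE variants of the bitconvert patterns above all funnel through REV16/REV32/REV64, which reverse the element order within a fixed-size container. A small host-side model of that lane reshuffle, assuming only the architectural REV semantics:

#include <array>
#include <cassert>
#include <cstdint>

// Model of the NEON REV family on a 64-bit register held as 8 bytes:
// within every `container`-byte chunk, reverse the order of the `elt`-byte elements.
// REV64.4h -> rev(x, 8, 2), REV32.4h -> rev(x, 4, 2), REV16.8b -> rev(x, 2, 1).
using Bytes = std::array<uint8_t, 8>;

static Bytes rev(const Bytes &in, unsigned container, unsigned elt) {
  Bytes out{};
  for (unsigned base = 0; base < 8; base += container)
    for (unsigned e = 0; e < container / elt; ++e)
      for (unsigned b = 0; b < elt; ++b)
        out[base + e * elt + b] = in[base + (container / elt - 1 - e) * elt + b];
  return out;
}

int main() {
  Bytes x = {0, 1, 2, 3, 4, 5, 6, 7};
  // REV64.4h reverses the four 16-bit lanes within the doubleword.
  assert((rev(x, 8, 2) == Bytes{6, 7, 4, 5, 2, 3, 0, 1}));
  // Each REV is an involution, so a cast and the reverse cast cancel out, matching
  // the pairs of IsBE patterns above that use the same REV in both directions.
  assert(rev(rev(x, 4, 2), 4, 2) == x);
  return 0;
}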
-//===----------------------------------------------------------------------===// - -#include "AArch64InstrInfo.h" -#include "AArch64MachineFunctionInfo.h" -#include "AArch64RegisterBankInfo.h" -#include "AArch64RegisterInfo.h" -#include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64AddressingModes.h" -#include "llvm/ADT/Optional.h" -#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "aarch64-isel" - -using namespace llvm; - -namespace { - -#define GET_GLOBALISEL_PREDICATE_BITSET -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_PREDICATE_BITSET - -class AArch64InstructionSelector : public InstructionSelector { -public: - AArch64InstructionSelector(const AArch64TargetMachine &TM, - const AArch64Subtarget &STI, - const AArch64RegisterBankInfo &RBI); - - bool select(MachineInstr &I) override; - static const char *getName() { return DEBUG_TYPE; } - - void setupMF(MachineFunction &MF, GISelKnownBits &KB, - CodeGenCoverage &CoverageInfo) override { - InstructionSelector::setupMF(MF, KB, CoverageInfo); - - // hasFnAttribute() is expensive to call on every BRCOND selection, so - // cache it here for each run of the selector. - ProduceNonFlagSettingCondBr = - !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); - } - -private: - /// tblgen-erated 'select' implementation, used as the initial selector for - /// the patterns that don't require complex C++. - bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - - // A lowering phase that runs before any selection attempts. - - void preISelLower(MachineInstr &I) const; - - // An early selection function that runs before the selectImpl() call. - bool earlySelect(MachineInstr &I) const; - - bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; - - /// Eliminate same-sized cross-bank copies into stores before selectImpl(). - void contractCrossBankCopyIntoStore(MachineInstr &I, - MachineRegisterInfo &MRI) const; - - bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, - MachineRegisterInfo &MRI) const; - bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, - MachineRegisterInfo &MRI) const; - - bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, - MachineRegisterInfo &MRI) const; - - bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; - - // Helper to generate an equivalent of scalar_to_vector into a new register, - // returned via 'Dst'. - MachineInstr *emitScalarToVector(unsigned EltSize, - const TargetRegisterClass *DstRC, - Register Scalar, - MachineIRBuilder &MIRBuilder) const; - - /// Emit a lane insert into \p DstReg, or a new vector register if None is - /// provided. - /// - /// The lane inserted into is defined by \p LaneIdx. 
The vector source - /// register is given by \p SrcReg. The register containing the element is - /// given by \p EltReg. - MachineInstr *emitLaneInsert(Optional DstReg, Register SrcReg, - Register EltReg, unsigned LaneIdx, - const RegisterBank &RB, - MachineIRBuilder &MIRBuilder) const; - bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; - - bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectSplitVectorUnmerge(MachineInstr &I, - MachineRegisterInfo &MRI) const; - bool selectIntrinsicWithSideEffects(MachineInstr &I, - MachineRegisterInfo &MRI) const; - bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; - - unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; - MachineInstr *emitLoadFromConstantPool(Constant *CPVal, - MachineIRBuilder &MIRBuilder) const; - - // Emit a vector concat operation. - MachineInstr *emitVectorConcat(Optional Dst, Register Op1, - Register Op2, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, - MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitTST(const Register &LHS, const Register &RHS, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitExtractVectorElt(Optional DstReg, - const RegisterBank &DstRB, LLT ScalarTy, - Register VecReg, unsigned LaneIdx, - MachineIRBuilder &MIRBuilder) const; - - /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be - /// materialized using a FMOV instruction, then update MI and return it. - /// Otherwise, do nothing and return a nullptr. - MachineInstr *emitFMovForFConstant(MachineInstr &MI, - MachineRegisterInfo &MRI) const; - - /// Emit a CSet for a compare. - MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, - MachineIRBuilder &MIRBuilder) const; - - // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. - // We use these manually instead of using the importer since it doesn't - // support SDNodeXForm. 
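Editorial note: selectShiftA_32 and friends, declared just below, hand-compute what the i32shift_a/i32shift_b SDNodeXForms produce, since the GlobalISel importer cannot consume SDNodeXForm. The transforms themselves are not reproduced in this diff; the sketch assumes the standard encoding of LSL as UBFM (immr = (32 - shift) mod 32, imms = 31 - shift) and checks it against a minimal UBFM model:

#include <cassert>
#include <cstdint>

// LSL Wd, Wn, #imm is an alias of UBFM Wd, Wn, #((32 - imm) % 32), #(31 - imm).
static unsigned shiftA32(unsigned imm) { return (32 - imm) & 0x1f; } // immr
static unsigned shiftB32(unsigned imm) { return 31 - imm; }          // imms

// Minimal 32-bit UBFM model, covering only the two cases a shift produces.
static uint32_t ubfm32(uint32_t x, unsigned immr, unsigned imms) {
  if (imms >= immr)   // LSR/UBFX form: extract bits [imms:immr] into the low bits.
    return (x >> immr) & (0xffffffffu >> (31 - (imms - immr)));
  // LSL/UBFIZ form: take bits [imms:0] and shift them up by (32 - immr).
  return (x & (0xffffffffu >> (31 - imms))) << (32 - immr);
}

int main() {
  for (unsigned imm = 0; imm < 32; ++imm) {
    uint32_t x = 0xdeadbeefu;
    assert(ubfm32(x, shiftA32(imm), shiftB32(imm)) == (uint32_t)(x << imm));
  }
  return 0;
}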
- ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; - ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; - ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; - ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; - - ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; - ComplexRendererFns selectArithImmed(MachineOperand &Root) const; - ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; - - ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, - unsigned Size) const; - - ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { - return selectAddrModeUnscaled(Root, 1); - } - ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { - return selectAddrModeUnscaled(Root, 2); - } - ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { - return selectAddrModeUnscaled(Root, 4); - } - ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { - return selectAddrModeUnscaled(Root, 8); - } - ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { - return selectAddrModeUnscaled(Root, 16); - } - - ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, - unsigned Size) const; - template - ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { - return selectAddrModeIndexed(Root, Width / 8); - } - - bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, - const MachineRegisterInfo &MRI) const; - ComplexRendererFns - selectAddrModeShiftedExtendXReg(MachineOperand &Root, - unsigned SizeInBytes) const; - - /// Returns a \p ComplexRendererFns which contains a base, offset, and whether - /// or not a shift + extend should be folded into an addressing mode. Returns - /// None when this is not profitable or possible. - ComplexRendererFns - selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, - MachineOperand &Offset, unsigned SizeInBytes, - bool WantsExt) const; - ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; - ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, - unsigned SizeInBytes) const; - template - ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { - return selectAddrModeXRO(Root, Width / 8); - } - - ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, - unsigned SizeInBytes) const; - template - ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { - return selectAddrModeWRO(Root, Width / 8); - } - - ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; - - ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { - return selectShiftedRegister(Root); - } - - ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { - // TODO: selectShiftedRegister should allow for rotates on logical shifts. - // For now, make them the same. The only difference between the two is that - // logical shifts are allowed to fold in rotates. Otherwise, these are - // functionally the same. - return selectShiftedRegister(Root); - } - - /// Given an extend instruction, determine the correct shift-extend type for - /// that instruction. - /// - /// If the instruction is going to be used in a load or store, pass - /// \p IsLoadStore = true. 
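Editorial note: the selectAddrModeIndexed<Width> renderers above scale the byte offset by the access size and only match the unsigned, scaled 12-bit immediate of the LDR/STR "ui" forms, while selectAddrModeUnscaled covers the signed 9-bit LDUR/STUR immediates. A rough sketch of that split; the ranges are the architectural ones, and classifyOffset is an illustrative helper rather than LLVM API:

#include <cassert>
#include <cstdint>

enum class AddrForm { ScaledUImm12, UnscaledSImm9, NeedsSeparateAdd };

// Decide which immediate addressing form a load/store of `size` bytes can use
// for a given byte offset from the base register.
static AddrForm classifyOffset(int64_t offset, int64_t size) {
  if (offset >= 0 && offset % size == 0 && offset / size <= 4095)
    return AddrForm::ScaledUImm12;     // e.g. ldr x0, [x1, #offset]
  if (offset >= -256 && offset <= 255)
    return AddrForm::UnscaledSImm9;    // e.g. ldur x0, [x1, #offset]
  return AddrForm::NeedsSeparateAdd;   // materialise the address separately
}

int main() {
  assert(classifyOffset(8, 8) == AddrForm::ScaledUImm12);    // aligned, in range
  assert(classifyOffset(4, 8) == AddrForm::UnscaledSImm9);   // not a multiple of the scale
  assert(classifyOffset(-16, 8) == AddrForm::UnscaledSImm9); // negative offsets are unscaled
  assert(classifyOffset(40000, 8) == AddrForm::NeedsSeparateAdd);
  return 0;
}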
- AArch64_AM::ShiftExtendType - getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, - bool IsLoadStore = false) const; - - /// Instructions that accept extend modifiers like UXTW expect the register - /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a - /// subregister copy if necessary. Return either ExtReg, or the result of the - /// new copy. - Register narrowExtendRegIfNeeded(Register ExtReg, - MachineIRBuilder &MIB) const; - ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; - - void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx = -1) const; - void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, - int OpIdx = -1) const; - void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, - int OpIdx = -1) const; - - // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. - void materializeLargeCMVal(MachineInstr &I, const Value *V, - unsigned OpFlags) const; - - // Optimization methods. - bool tryOptVectorShuffle(MachineInstr &I) const; - bool tryOptVectorDup(MachineInstr &MI) const; - bool tryOptSelect(MachineInstr &MI) const; - MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, - MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const; - - /// Return true if \p MI is a load or store of \p NumBytes bytes. - bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; - - /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit - /// register zeroed out. In other words, the result of MI has been explicitly - /// zero extended. - bool isDef32(const MachineInstr &MI) const; - - const AArch64TargetMachine &TM; - const AArch64Subtarget &STI; - const AArch64InstrInfo &TII; - const AArch64RegisterInfo &TRI; - const AArch64RegisterBankInfo &RBI; - - bool ProduceNonFlagSettingCondBr = false; - -#define GET_GLOBALISEL_PREDICATES_DECL -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_PREDICATES_DECL - -// We declare the temporaries used by selectImpl() in the class to minimize the -// cost of constructing placeholder values. -#define GET_GLOBALISEL_TEMPORARIES_DECL -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_TEMPORARIES_DECL -}; - -} // end anonymous namespace - -#define GET_GLOBALISEL_IMPL -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_IMPL - -AArch64InstructionSelector::AArch64InstructionSelector( - const AArch64TargetMachine &TM, const AArch64Subtarget &STI, - const AArch64RegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), -#define GET_GLOBALISEL_PREDICATES_INIT -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_PREDICATES_INIT -#define GET_GLOBALISEL_TEMPORARIES_INIT -#include "AArch64GenGlobalISel.inc" -#undef GET_GLOBALISEL_TEMPORARIES_INIT -{ -} - -// FIXME: This should be target-independent, inferred from the types declared -// for each class in the bank. -static const TargetRegisterClass * -getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, - const RegisterBankInfo &RBI, - bool GetAllRegSet = false) { - if (RB.getID() == AArch64::GPRRegBankID) { - if (Ty.getSizeInBits() <= 32) - return GetAllRegSet ? &AArch64::GPR32allRegClass - : &AArch64::GPR32RegClass; - if (Ty.getSizeInBits() == 64) - return GetAllRegSet ? 
&AArch64::GPR64allRegClass - : &AArch64::GPR64RegClass; - return nullptr; - } - - if (RB.getID() == AArch64::FPRRegBankID) { - if (Ty.getSizeInBits() <= 16) - return &AArch64::FPR16RegClass; - if (Ty.getSizeInBits() == 32) - return &AArch64::FPR32RegClass; - if (Ty.getSizeInBits() == 64) - return &AArch64::FPR64RegClass; - if (Ty.getSizeInBits() == 128) - return &AArch64::FPR128RegClass; - return nullptr; - } - - return nullptr; -} - -/// Given a register bank, and size in bits, return the smallest register class -/// that can represent that combination. -static const TargetRegisterClass * -getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, - bool GetAllRegSet = false) { - unsigned RegBankID = RB.getID(); - - if (RegBankID == AArch64::GPRRegBankID) { - if (SizeInBits <= 32) - return GetAllRegSet ? &AArch64::GPR32allRegClass - : &AArch64::GPR32RegClass; - if (SizeInBits == 64) - return GetAllRegSet ? &AArch64::GPR64allRegClass - : &AArch64::GPR64RegClass; - } - - if (RegBankID == AArch64::FPRRegBankID) { - switch (SizeInBits) { - default: - return nullptr; - case 8: - return &AArch64::FPR8RegClass; - case 16: - return &AArch64::FPR16RegClass; - case 32: - return &AArch64::FPR32RegClass; - case 64: - return &AArch64::FPR64RegClass; - case 128: - return &AArch64::FPR128RegClass; - } - } - - return nullptr; -} - -/// Returns the correct subregister to use for a given register class. -static bool getSubRegForClass(const TargetRegisterClass *RC, - const TargetRegisterInfo &TRI, unsigned &SubReg) { - switch (TRI.getRegSizeInBits(*RC)) { - case 8: - SubReg = AArch64::bsub; - break; - case 16: - SubReg = AArch64::hsub; - break; - case 32: - if (RC != &AArch64::FPR32RegClass) - SubReg = AArch64::sub_32; - else - SubReg = AArch64::ssub; - break; - case 64: - SubReg = AArch64::dsub; - break; - default: - LLVM_DEBUG( - dbgs() << "Couldn't find appropriate subregister for register class."); - return false; - } - - return true; -} - -/// Check whether \p I is a currently unsupported binary operation: -/// - it has an unsized type -/// - an operand is not a vreg -/// - all operands are not in the same bank -/// These are checks that should someday live in the verifier, but right now, -/// these are mostly limitations of the aarch64 selector. -static bool unsupportedBinOp(const MachineInstr &I, - const AArch64RegisterBankInfo &RBI, - const MachineRegisterInfo &MRI, - const AArch64RegisterInfo &TRI) { - LLT Ty = MRI.getType(I.getOperand(0).getReg()); - if (!Ty.isValid()) { - LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); - return true; - } - - const RegisterBank *PrevOpBank = nullptr; - for (auto &MO : I.operands()) { - // FIXME: Support non-register operands. - if (!MO.isReg()) { - LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); - return true; - } - - // FIXME: Can generic operations have physical registers operands? If - // so, this will need to be taught about that, and we'll need to get the - // bank out of the minimal class for the register. - // Either way, this needs to be documented (and possibly verified). 
- if (!Register::isVirtualRegister(MO.getReg())) { - LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); - return true; - } - - const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); - if (!OpBank) { - LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); - return true; - } - - if (PrevOpBank && OpBank != PrevOpBank) { - LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); - return true; - } - PrevOpBank = OpBank; - } - return false; -} - -/// Select the AArch64 opcode for the basic binary operation \p GenericOpc -/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID -/// and of size \p OpSize. -/// \returns \p GenericOpc if the combination is unsupported. -static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, - unsigned OpSize) { - switch (RegBankID) { - case AArch64::GPRRegBankID: - if (OpSize == 32) { - switch (GenericOpc) { - case TargetOpcode::G_SHL: - return AArch64::LSLVWr; - case TargetOpcode::G_LSHR: - return AArch64::LSRVWr; - case TargetOpcode::G_ASHR: - return AArch64::ASRVWr; - default: - return GenericOpc; - } - } else if (OpSize == 64) { - switch (GenericOpc) { - case TargetOpcode::G_PTR_ADD: - return AArch64::ADDXrr; - case TargetOpcode::G_SHL: - return AArch64::LSLVXr; - case TargetOpcode::G_LSHR: - return AArch64::LSRVXr; - case TargetOpcode::G_ASHR: - return AArch64::ASRVXr; - default: - return GenericOpc; - } - } - break; - case AArch64::FPRRegBankID: - switch (OpSize) { - case 32: - switch (GenericOpc) { - case TargetOpcode::G_FADD: - return AArch64::FADDSrr; - case TargetOpcode::G_FSUB: - return AArch64::FSUBSrr; - case TargetOpcode::G_FMUL: - return AArch64::FMULSrr; - case TargetOpcode::G_FDIV: - return AArch64::FDIVSrr; - default: - return GenericOpc; - } - case 64: - switch (GenericOpc) { - case TargetOpcode::G_FADD: - return AArch64::FADDDrr; - case TargetOpcode::G_FSUB: - return AArch64::FSUBDrr; - case TargetOpcode::G_FMUL: - return AArch64::FMULDrr; - case TargetOpcode::G_FDIV: - return AArch64::FDIVDrr; - case TargetOpcode::G_OR: - return AArch64::ORRv8i8; - default: - return GenericOpc; - } - } - break; - } - return GenericOpc; -} - -/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, -/// appropriate for the (value) register bank \p RegBankID and of memory access -/// size \p OpSize. This returns the variant with the base+unsigned-immediate -/// addressing mode (e.g., LDRXui). -/// \returns \p GenericOpc if the combination is unsupported. -static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, - unsigned OpSize) { - const bool isStore = GenericOpc == TargetOpcode::G_STORE; - switch (RegBankID) { - case AArch64::GPRRegBankID: - switch (OpSize) { - case 8: - return isStore ? AArch64::STRBBui : AArch64::LDRBBui; - case 16: - return isStore ? AArch64::STRHHui : AArch64::LDRHHui; - case 32: - return isStore ? AArch64::STRWui : AArch64::LDRWui; - case 64: - return isStore ? AArch64::STRXui : AArch64::LDRXui; - } - break; - case AArch64::FPRRegBankID: - switch (OpSize) { - case 8: - return isStore ? AArch64::STRBui : AArch64::LDRBui; - case 16: - return isStore ? AArch64::STRHui : AArch64::LDRHui; - case 32: - return isStore ? AArch64::STRSui : AArch64::LDRSui; - case 64: - return isStore ? AArch64::STRDui : AArch64::LDRDui; - } - break; - } - return GenericOpc; -} - -#ifndef NDEBUG -/// Helper function that verifies that we have a valid copy at the end of -/// selectCopy. 
Verifies that the source and dest have the expected sizes and -/// then returns true. -static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - const Register DstReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); - const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - - // Make sure the size of the source and dest line up. - assert( - (DstSize == SrcSize || - // Copies are a mean to setup initial types, the number of - // bits may not exactly match. - (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || - // Copies are a mean to copy bits around, as long as we are - // on the same register class, that's fine. Otherwise, that - // means we need some SUBREG_TO_REG or AND & co. - (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && - "Copy with different width?!"); - - // Check the size of the destination. - assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && - "GPRs cannot get more than 64-bit width values"); - - return true; -} -#endif - -/// Helper function for selectCopy. Inserts a subregister copy from -/// \p *From to \p *To, linking it up to \p I. -/// -/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into -/// -/// CopyReg (From class) = COPY SrcReg -/// SubRegCopy (To class) = COPY CopyReg:SubReg -/// Dst = COPY SubRegCopy -static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, - const RegisterBankInfo &RBI, Register SrcReg, - const TargetRegisterClass *From, - const TargetRegisterClass *To, - unsigned SubReg) { - MachineIRBuilder MIB(I); - auto Copy = MIB.buildCopy({From}, {SrcReg}); - auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {}) - .addReg(Copy.getReg(0), 0, SubReg); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(SubRegCopy.getReg(0)); - - // It's possible that the destination register won't be constrained. Make - // sure that happens. - if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) - RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); - - return true; -} - -/// Helper function to get the source and destination register classes for a -/// copy. Returns a std::pair containing the source register class for the -/// copy, and the destination register class for the copy. If a register class -/// cannot be determined, then it will be nullptr. -static std::pair -getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); - unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - - // Special casing for cross-bank copies of s1s. We can technically represent - // a 1-bit value with any size of register. The minimum size for a GPR is 32 - // bits. So, we need to put the FPR on 32 bits as well. - // - // FIXME: I'm not sure if this case holds true outside of copies. If it does, - // then we can pull it into the helpers that get the appropriate class for a - // register bank. Or make a new helper that carries along some constraint - // information. 
- if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) - SrcSize = DstSize = 32; - - return {getMinClassForRegBank(SrcRegBank, SrcSize, true), - getMinClassForRegBank(DstRegBank, DstSize, true)}; -} - -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - - // Find the correct register classes for the source and destination registers. - const TargetRegisterClass *SrcRC; - const TargetRegisterClass *DstRC; - std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); - - if (!DstRC) { - LLVM_DEBUG(dbgs() << "Unexpected dest size " - << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); - return false; - } - - // A couple helpers below, for making sure that the copy we produce is valid. - - // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want - // to verify that the src and dst are the same size, since that's handled by - // the SUBREG_TO_REG. - bool KnownValid = false; - - // Returns true, or asserts if something we don't expect happens. Instead of - // returning true, we return isValidCopy() to ensure that we verify the - // result. - auto CheckCopy = [&]() { - // If we have a bitcast or something, we can't have physical registers. - assert((I.isCopy() || - (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && - !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && - "No phys reg on generic operator!"); - assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); - (void)KnownValid; - return true; - }; - - // Is this a copy? If so, then we may need to insert a subregister copy, or - // a SUBREG_TO_REG. - if (I.isCopy()) { - // Yes. Check if there's anything to fix up. - if (!SrcRC) { - LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); - return false; - } - - unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); - unsigned DstSize = TRI.getRegSizeInBits(*DstRC); - - // If we're doing a cross-bank copy on different-sized registers, we need - // to do a bit more work. - if (SrcSize > DstSize) { - // We're doing a cross-bank copy into a smaller register. We need a - // subregister copy. First, get a register class that's on the same bank - // as the destination, but the same size as the source. - const TargetRegisterClass *SubregRC = - getMinClassForRegBank(DstRegBank, SrcSize, true); - assert(SubregRC && "Didn't get a register class for subreg?"); - - // Get the appropriate subregister for the destination. - unsigned SubReg = 0; - if (!getSubRegForClass(DstRC, TRI, SubReg)) { - LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); - return false; - } - - // Now, insert a subregister copy using the new register class. - selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); - return CheckCopy(); - } - - // Is this a cross-bank copy? - if (DstRegBank.getID() != SrcRegBank.getID()) { - if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && - SrcSize == 16) { - // Special case for FPR16 to GPR32. - // FIXME: This can probably be generalized like the above case. 
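Editorial note: a condensed restatement of the size-changing and cross-bank cases handled by selectCopy above, as a host-side sketch; the enum and helper names are illustrative only, not LLVM API:

#include <cassert>

enum class Bank { GPR, FPR };
enum class CopyKind { Plain, SubRegCopy, PromoteFPR16ToGPR32 };

static CopyKind classifyCopy(Bank srcBank, unsigned srcBits,
                             Bank dstBank, unsigned dstBits) {
  if (srcBits > dstBits)
    return CopyKind::SubRegCopy;          // read the destination-sized subregister
  if (srcBank == Bank::FPR && dstBank == Bank::GPR && srcBits == 16 && dstBits == 32)
    return CopyKind::PromoteFPR16ToGPR32; // SUBREG_TO_REG of hsub into an FPR32 first
  return CopyKind::Plain;                 // just constrain the register classes
}

int main() {
  assert(classifyCopy(Bank::FPR, 64, Bank::FPR, 32) == CopyKind::SubRegCopy);
  assert(classifyCopy(Bank::FPR, 16, Bank::GPR, 32) == CopyKind::PromoteFPR16ToGPR32);
  assert(classifyCopy(Bank::GPR, 32, Bank::GPR, 32) == CopyKind::Plain);
  return 0;
}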
- Register PromoteReg = - MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(AArch64::SUBREG_TO_REG), PromoteReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::hsub); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(PromoteReg); - - // Promise that the copy is implicitly validated by the SUBREG_TO_REG. - KnownValid = true; - } - } - - // If the destination is a physical register, then there's nothing to - // change, so we're done. - if (Register::isPhysicalRegister(DstReg)) - return CheckCopy(); - } - - // No need to constrain SrcReg. It will get constrained when we hit another - // of its use or its defs. Copies do not have constraints. - if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) - << " operand\n"); - return false; - } - I.setDesc(TII.get(AArch64::COPY)); - return CheckCopy(); -} - -static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { - if (!DstTy.isScalar() || !SrcTy.isScalar()) - return GenericOpc; - - const unsigned DstSize = DstTy.getSizeInBits(); - const unsigned SrcSize = SrcTy.getSizeInBits(); - - switch (DstSize) { - case 32: - switch (SrcSize) { - case 32: - switch (GenericOpc) { - case TargetOpcode::G_SITOFP: - return AArch64::SCVTFUWSri; - case TargetOpcode::G_UITOFP: - return AArch64::UCVTFUWSri; - case TargetOpcode::G_FPTOSI: - return AArch64::FCVTZSUWSr; - case TargetOpcode::G_FPTOUI: - return AArch64::FCVTZUUWSr; - default: - return GenericOpc; - } - case 64: - switch (GenericOpc) { - case TargetOpcode::G_SITOFP: - return AArch64::SCVTFUXSri; - case TargetOpcode::G_UITOFP: - return AArch64::UCVTFUXSri; - case TargetOpcode::G_FPTOSI: - return AArch64::FCVTZSUWDr; - case TargetOpcode::G_FPTOUI: - return AArch64::FCVTZUUWDr; - default: - return GenericOpc; - } - default: - return GenericOpc; - } - case 64: - switch (SrcSize) { - case 32: - switch (GenericOpc) { - case TargetOpcode::G_SITOFP: - return AArch64::SCVTFUWDri; - case TargetOpcode::G_UITOFP: - return AArch64::UCVTFUWDri; - case TargetOpcode::G_FPTOSI: - return AArch64::FCVTZSUXSr; - case TargetOpcode::G_FPTOUI: - return AArch64::FCVTZUUXSr; - default: - return GenericOpc; - } - case 64: - switch (GenericOpc) { - case TargetOpcode::G_SITOFP: - return AArch64::SCVTFUXDri; - case TargetOpcode::G_UITOFP: - return AArch64::UCVTFUXDri; - case TargetOpcode::G_FPTOSI: - return AArch64::FCVTZSUXDr; - case TargetOpcode::G_FPTOUI: - return AArch64::FCVTZUUXDr; - default: - return GenericOpc; - } - default: - return GenericOpc; - } - default: - return GenericOpc; - }; - return GenericOpc; -} - -static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI, - const RegisterBankInfo &RBI) { - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != - AArch64::GPRRegBankID); - LLT Ty = MRI.getType(I.getOperand(0).getReg()); - if (Ty == LLT::scalar(32)) - return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr; - else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) - return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr; - return 0; -} - -/// Helper function to select the opcode for a G_FCMP. -static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) { - // If this is a compare against +0.0, then we don't have to explicitly - // materialize a constant. 
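Editorial note: the routine below has only two decisions to make: whether the right-hand side is a literal +0.0, so the immediate FCMP form avoids materializing a constant, and whether the operands are 32- or 64-bit. A tiny illustrative model of that table lookup, not LLVM API:

#include <cassert>
#include <string>

// Mirrors the CmpOpcTbl lookup below: [use +0.0 immediate?][is 64-bit?].
static std::string fcmpOpcode(bool rhsIsPosZero, unsigned opSizeInBits) {
  if (opSizeInBits != 32 && opSizeInBits != 64)
    return "";                           // unsupported size, caller bails out
  static const char *Tbl[2][2] = {{"FCMPSrr", "FCMPDrr"},
                                  {"FCMPSri", "FCMPDri"}};
  return Tbl[rhsIsPosZero][opSizeInBits == 64];
}

int main() {
  assert(fcmpOpcode(true, 32) == "FCMPSri");   // fcmp s0, #0.0
  assert(fcmpOpcode(false, 64) == "FCMPDrr");  // fcmp d0, d1
  return 0;
}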
- const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI); - bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); - unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); - if (OpSize != 32 && OpSize != 64) - return 0; - unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, - {AArch64::FCMPSri, AArch64::FCMPDri}}; - return CmpOpcTbl[ShouldUseImm][OpSize == 64]; -} - -/// Returns true if \p P is an unsigned integer comparison predicate. -static bool isUnsignedICMPPred(const CmpInst::Predicate P) { - switch (P) { - default: - return false; - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_ULE: - return true; - } -} - -static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { - switch (P) { - default: - llvm_unreachable("Unknown condition code!"); - case CmpInst::ICMP_NE: - return AArch64CC::NE; - case CmpInst::ICMP_EQ: - return AArch64CC::EQ; - case CmpInst::ICMP_SGT: - return AArch64CC::GT; - case CmpInst::ICMP_SGE: - return AArch64CC::GE; - case CmpInst::ICMP_SLT: - return AArch64CC::LT; - case CmpInst::ICMP_SLE: - return AArch64CC::LE; - case CmpInst::ICMP_UGT: - return AArch64CC::HI; - case CmpInst::ICMP_UGE: - return AArch64CC::HS; - case CmpInst::ICMP_ULT: - return AArch64CC::LO; - case CmpInst::ICMP_ULE: - return AArch64CC::LS; - } -} - -static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, - AArch64CC::CondCode &CondCode, - AArch64CC::CondCode &CondCode2) { - CondCode2 = AArch64CC::AL; - switch (P) { - default: - llvm_unreachable("Unknown FP condition!"); - case CmpInst::FCMP_OEQ: - CondCode = AArch64CC::EQ; - break; - case CmpInst::FCMP_OGT: - CondCode = AArch64CC::GT; - break; - case CmpInst::FCMP_OGE: - CondCode = AArch64CC::GE; - break; - case CmpInst::FCMP_OLT: - CondCode = AArch64CC::MI; - break; - case CmpInst::FCMP_OLE: - CondCode = AArch64CC::LS; - break; - case CmpInst::FCMP_ONE: - CondCode = AArch64CC::MI; - CondCode2 = AArch64CC::GT; - break; - case CmpInst::FCMP_ORD: - CondCode = AArch64CC::VC; - break; - case CmpInst::FCMP_UNO: - CondCode = AArch64CC::VS; - break; - case CmpInst::FCMP_UEQ: - CondCode = AArch64CC::EQ; - CondCode2 = AArch64CC::VS; - break; - case CmpInst::FCMP_UGT: - CondCode = AArch64CC::HI; - break; - case CmpInst::FCMP_UGE: - CondCode = AArch64CC::PL; - break; - case CmpInst::FCMP_ULT: - CondCode = AArch64CC::LT; - break; - case CmpInst::FCMP_ULE: - CondCode = AArch64CC::LE; - break; - case CmpInst::FCMP_UNE: - CondCode = AArch64CC::NE; - break; - } -} - -bool AArch64InstructionSelector::selectCompareBranch( - MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - - const Register CondReg = I.getOperand(0).getReg(); - MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - MachineInstr *CCMI = MRI.getVRegDef(CondReg); - if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) - CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg()); - if (CCMI->getOpcode() != TargetOpcode::G_ICMP) - return false; - - Register LHS = CCMI->getOperand(2).getReg(); - Register RHS = CCMI->getOperand(3).getReg(); - auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); - if (!VRegAndVal) - std::swap(RHS, LHS); - - VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); - if (!VRegAndVal || VRegAndVal->Value != 0) { - MachineIRBuilder MIB(I); - // If we can't select a CBZ then emit a cmp + Bcc. 
- if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3), - CCMI->getOperand(1), MIB)) - return false; - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - (CmpInst::Predicate)CCMI->getOperand(1).getPredicate()); - MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); - I.eraseFromParent(); - return true; - } - - const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); - if (RB.getID() != AArch64::GPRRegBankID) - return false; - - const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); - if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) - return false; - - const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); - unsigned CBOpc = 0; - if (CmpWidth <= 32) - CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); - else if (CmpWidth == 64) - CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX); - else - return false; - - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) - .addUse(LHS) - .addMBB(DestMBB) - .constrainAllUses(TII, TRI, RBI); - - I.eraseFromParent(); - return true; -} - -/// Returns the element immediate value of a vector shift operand if found. -/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. -static Optional getVectorShiftImm(Register Reg, - MachineRegisterInfo &MRI) { - assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); - MachineInstr *OpMI = MRI.getVRegDef(Reg); - assert(OpMI && "Expected to find a vreg def for vector shift operand"); - if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR) - return None; - - // Check all operands are identical immediates. - int64_t ImmVal = 0; - for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) { - auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI); - if (!VRegAndVal) - return None; - - if (Idx == 1) - ImmVal = VRegAndVal->Value; - if (ImmVal != VRegAndVal->Value) - return None; - } - - return ImmVal; -} - -/// Matches and returns the shift immediate value for a SHL instruction given -/// a shift operand. -static Optional getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { - Optional ShiftImm = getVectorShiftImm(Reg, MRI); - if (!ShiftImm) - return None; - // Check the immediate is in range for a SHL. - int64_t Imm = *ShiftImm; - if (Imm < 0) - return None; - switch (SrcTy.getElementType().getSizeInBits()) { - default: - LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); - return None; - case 8: - if (Imm > 7) - return None; - break; - case 16: - if (Imm > 15) - return None; - break; - case 32: - if (Imm > 31) - return None; - break; - case 64: - if (Imm > 63) - return None; - break; - } - return Imm; -} - -bool AArch64InstructionSelector::selectVectorSHL( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_SHL); - Register DstReg = I.getOperand(0).getReg(); - const LLT Ty = MRI.getType(DstReg); - Register Src1Reg = I.getOperand(1).getReg(); - Register Src2Reg = I.getOperand(2).getReg(); - - if (!Ty.isVector()) - return false; - - // Check if we have a vector of constants on RHS that we can select as the - // immediate form. - Optional ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); - - unsigned Opc = 0; - if (Ty == LLT::vector(2, 64)) { - Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; - } else if (Ty == LLT::vector(4, 32)) { - Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; - } else if (Ty == LLT::vector(2, 32)) { - Opc = ImmVal ? 
AArch64::SHLv2i32_shift : AArch64::USHLv2i32; - } else { - LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); - return false; - } - - MachineIRBuilder MIB(I); - auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); - if (ImmVal) - Shl.addImm(*ImmVal); - else - Shl.addUse(Src2Reg); - constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectVectorASHR( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_ASHR); - Register DstReg = I.getOperand(0).getReg(); - const LLT Ty = MRI.getType(DstReg); - Register Src1Reg = I.getOperand(1).getReg(); - Register Src2Reg = I.getOperand(2).getReg(); - - if (!Ty.isVector()) - return false; - - // There is not a shift right register instruction, but the shift left - // register instruction takes a signed value, where negative numbers specify a - // right shift. - - unsigned Opc = 0; - unsigned NegOpc = 0; - const TargetRegisterClass *RC = nullptr; - if (Ty == LLT::vector(2, 64)) { - Opc = AArch64::SSHLv2i64; - NegOpc = AArch64::NEGv2i64; - RC = &AArch64::FPR128RegClass; - } else if (Ty == LLT::vector(4, 32)) { - Opc = AArch64::SSHLv4i32; - NegOpc = AArch64::NEGv4i32; - RC = &AArch64::FPR128RegClass; - } else if (Ty == LLT::vector(2, 32)) { - Opc = AArch64::SSHLv2i32; - NegOpc = AArch64::NEGv2i32; - RC = &AArch64::FPR64RegClass; - } else { - LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); - return false; - } - - MachineIRBuilder MIB(I); - auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); - constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); - auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); - constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectVaStartAAPCS( - MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - return false; -} - -bool AArch64InstructionSelector::selectVaStartDarwin( - MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - Register ListReg = I.getOperand(0).getReg(); - - Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - - auto MIB = - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) - .addDef(ArgsAddrReg) - .addFrameIndex(FuncInfo->getVarArgsStackIndex()) - .addImm(0) - .addImm(0); - - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - - MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) - .addUse(ArgsAddrReg) - .addUse(ListReg) - .addImm(0) - .addMemOperand(*I.memoperands_begin()); - - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -void AArch64InstructionSelector::materializeLargeCMVal( - MachineInstr &I, const Value *V, unsigned OpFlags) const { - MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineIRBuilder MIB(I); - - auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); - MovZ->addOperand(MF, I.getOperand(1)); - MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | - AArch64II::MO_NC); - MovZ->addOperand(MF, MachineOperand::CreateImm(0)); - constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); - - auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, - Register ForceDstReg) { - Register DstReg = ForceDstReg - ? 
ForceDstReg - : MRI.createVirtualRegister(&AArch64::GPR64RegClass); - auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); - if (auto *GV = dyn_cast(V)) { - MovI->addOperand(MF, MachineOperand::CreateGA( - GV, MovZ->getOperand(1).getOffset(), Flags)); - } else { - MovI->addOperand( - MF, MachineOperand::CreateBA(cast(V), - MovZ->getOperand(1).getOffset(), Flags)); - } - MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); - constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); - return DstReg; - }; - Register DstReg = BuildMovK(MovZ.getReg(0), - AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); - DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); - BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); - return; -} - -void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { - MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - switch (I.getOpcode()) { - case TargetOpcode::G_SHL: - case TargetOpcode::G_ASHR: - case TargetOpcode::G_LSHR: { - // These shifts are legalized to have 64 bit shift amounts because we want - // to take advantage of the existing imported selection patterns that assume - // the immediates are s64s. However, if the shifted type is 32 bits and for - // some reason we receive input GMIR that has an s64 shift amount that's not - // a G_CONSTANT, insert a truncate so that we can still select the s32 - // register-register variant. - Register SrcReg = I.getOperand(1).getReg(); - Register ShiftReg = I.getOperand(2).getReg(); - const LLT ShiftTy = MRI.getType(ShiftReg); - const LLT SrcTy = MRI.getType(SrcReg); - if (SrcTy.isVector()) - return; - assert(!ShiftTy.isVector() && "unexpected vector shift ty"); - if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) - return; - auto *AmtMI = MRI.getVRegDef(ShiftReg); - assert(AmtMI && "could not find a vreg definition for shift amount"); - if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { - // Insert a subregister copy to implement a 64->32 trunc - MachineIRBuilder MIB(I); - auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) - .addReg(ShiftReg, 0, AArch64::sub_32); - MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); - I.getOperand(2).setReg(Trunc.getReg(0)); - } - return; - } - case TargetOpcode::G_STORE: - contractCrossBankCopyIntoStore(I, MRI); - return; - default: - return; - } -} - -bool AArch64InstructionSelector::earlySelectSHL( - MachineInstr &I, MachineRegisterInfo &MRI) const { - // We try to match the immediate variant of LSL, which is actually an alias - // for a special case of UBFM. Otherwise, we fall back to the imported - // selector which will match the register variant. - assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); - const auto &MO = I.getOperand(2); - auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI); - if (!VRegAndVal) - return false; - - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - if (DstTy.isVector()) - return false; - bool Is64Bit = DstTy.getSizeInBits() == 64; - auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); - auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); - MachineIRBuilder MIB(I); - - if (!Imm1Fn || !Imm2Fn) - return false; - - auto NewI = - MIB.buildInstr(Is64Bit ? 
AArch64::UBFMXri : AArch64::UBFMWri, - {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); - - for (auto &RenderFn : *Imm1Fn) - RenderFn(NewI); - for (auto &RenderFn : *Imm2Fn) - RenderFn(NewI); - - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); -} - -void AArch64InstructionSelector::contractCrossBankCopyIntoStore( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); - // If we're storing a scalar, it doesn't matter what register bank that - // scalar is on. All that matters is the size. - // - // So, if we see something like this (with a 32-bit scalar as an example): - // - // %x:gpr(s32) = ... something ... - // %y:fpr(s32) = COPY %x:gpr(s32) - // G_STORE %y:fpr(s32) - // - // We can fix this up into something like this: - // - // G_STORE %x:gpr(s32) - // - // And then continue the selection process normally. - MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI); - if (!Def) - return; - Register DefDstReg = Def->getOperand(0).getReg(); - LLT DefDstTy = MRI.getType(DefDstReg); - Register StoreSrcReg = I.getOperand(0).getReg(); - LLT StoreSrcTy = MRI.getType(StoreSrcReg); - - // If we get something strange like a physical register, then we shouldn't - // go any further. - if (!DefDstTy.isValid()) - return; - - // Are the source and dst types the same size? - if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) - return; - - if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == - RBI.getRegBank(DefDstReg, MRI, TRI)) - return; - - // We have a cross-bank copy, which is entering a store. Let's fold it. - I.getOperand(0).setReg(DefDstReg); -} - -bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { - assert(I.getParent() && "Instruction should be in a basic block!"); - assert(I.getParent()->getParent() && "Instruction should be in a function!"); - - MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - switch (I.getOpcode()) { - case TargetOpcode::G_SHL: - return earlySelectSHL(I, MRI); - case TargetOpcode::G_CONSTANT: { - bool IsZero = false; - if (I.getOperand(1).isCImm()) - IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; - else if (I.getOperand(1).isImm()) - IsZero = I.getOperand(1).getImm() == 0; - - if (!IsZero) - return false; - - Register DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32)) - return false; - - if (Ty == LLT::scalar(64)) { - I.getOperand(1).ChangeToRegister(AArch64::XZR, false); - RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); - } else { - I.getOperand(1).ChangeToRegister(AArch64::WZR, false); - RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); - } - I.setDesc(TII.get(TargetOpcode::COPY)); - return true; - } - default: - return false; - } -} - -bool AArch64InstructionSelector::select(MachineInstr &I) { - assert(I.getParent() && "Instruction should be in a basic block!"); - assert(I.getParent()->getParent() && "Instruction should be in a function!"); - - MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - unsigned Opcode = I.getOpcode(); - // G_PHI requires same handling as PHI - if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) { - // Certain non-generic instructions also need some special handling. 
- - if (Opcode == TargetOpcode::LOAD_STACK_GUARD) - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - - if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { - const Register DefReg = I.getOperand(0).getReg(); - const LLT DefTy = MRI.getType(DefReg); - - const RegClassOrRegBank &RegClassOrBank = - MRI.getRegClassOrRegBank(DefReg); - - const TargetRegisterClass *DefRC - = RegClassOrBank.dyn_cast(); - if (!DefRC) { - if (!DefTy.isValid()) { - LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); - return false; - } - const RegisterBank &RB = *RegClassOrBank.get(); - DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); - if (!DefRC) { - LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); - return false; - } - } - - I.setDesc(TII.get(TargetOpcode::PHI)); - - return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); - } - - if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); - - return true; - } - - - if (I.getNumOperands() != I.getNumExplicitOperands()) { - LLVM_DEBUG( - dbgs() << "Generic instruction has unexpected implicit operands\n"); - return false; - } - - // Try to do some lowering before we start instruction selecting. These - // lowerings are purely transformations on the input G_MIR and so selection - // must continue after any modification of the instruction. - preISelLower(I); - - // There may be patterns where the importer can't deal with them optimally, - // but does select it to a suboptimal sequence so our custom C++ selection - // code later never has a chance to work on it. Therefore, we have an early - // selection attempt here to give priority to certain selection routines - // over the imported ones. - if (earlySelect(I)) - return true; - - if (selectImpl(I, *CoverageInfo)) - return true; - - LLT Ty = - I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{}; - - MachineIRBuilder MIB(I); - - switch (Opcode) { - case TargetOpcode::G_BRCOND: { - if (Ty.getSizeInBits() > 32) { - // We shouldn't need this on AArch64, but it would be implemented as an - // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the - // bit being tested is < 32. - LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty - << ", expected at most 32-bits"); - return false; - } - - const Register CondReg = I.getOperand(0).getReg(); - MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - - // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z - // instructions will not be produced, as they are conditional branch - // instructions that do not set flags. 
- bool ProduceNonFlagSettingCondBr = - !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); - if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI)) - return true; - - if (ProduceNonFlagSettingCondBr) { - auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) - .addUse(CondReg) - .addImm(/*bit offset=*/0) - .addMBB(DestMBB); - - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); - } else { - auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) - .addDef(AArch64::WZR) - .addUse(CondReg) - .addImm(1); - constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI); - auto Bcc = - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc)) - .addImm(AArch64CC::EQ) - .addMBB(DestMBB); - - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI); - } - } - - case TargetOpcode::G_BRINDIRECT: { - I.setDesc(TII.get(AArch64::BR)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_BRJT: - return selectBrJT(I, MRI); - - case TargetOpcode::G_BSWAP: { - // Handle vector types for G_BSWAP directly. - Register DstReg = I.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - - // We should only get vector types here; everything else is handled by the - // importer right now. - if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { - LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); - return false; - } - - // Only handle 4 and 2 element vectors for now. - // TODO: 16-bit elements. - unsigned NumElts = DstTy.getNumElements(); - if (NumElts != 4 && NumElts != 2) { - LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); - return false; - } - - // Choose the correct opcode for the supported types. Right now, that's - // v2s32, v4s32, and v2s64. - unsigned Opc = 0; - unsigned EltSize = DstTy.getElementType().getSizeInBits(); - if (EltSize == 32) - Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8 - : AArch64::REV32v16i8; - else if (EltSize == 64) - Opc = AArch64::REV64v16i8; - - // We should always get something by the time we get here... - assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); - - I.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_CONSTANT: { - const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; - - const LLT s8 = LLT::scalar(8); - const LLT s16 = LLT::scalar(16); - const LLT s32 = LLT::scalar(32); - const LLT s64 = LLT::scalar(64); - const LLT p0 = LLT::pointer(0, 64); - - const Register DefReg = I.getOperand(0).getReg(); - const LLT DefTy = MRI.getType(DefReg); - const unsigned DefSize = DefTy.getSizeInBits(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - // FIXME: Redundant check, but even less readable when factored out. - if (isFP) { - if (Ty != s32 && Ty != s64) { - LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty - << " constant, expected: " << s32 << " or " << s64 - << '\n'); - return false; - } - - if (RB.getID() != AArch64::FPRRegBankID) { - LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty - << " constant on bank: " << RB - << ", expected: FPR\n"); - return false; - } - - // The case when we have 0.0 is covered by tablegen. Reject it here so we - // can be sure tablegen works correctly and isn't rescued by this code. 
- if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0)) - return false; - } else { - // s32 and s64 are covered by tablegen. - if (Ty != p0 && Ty != s8 && Ty != s16) { - LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty - << " constant, expected: " << s32 << ", " << s64 - << ", or " << p0 << '\n'); - return false; - } - - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty - << " constant on bank: " << RB - << ", expected: GPR\n"); - return false; - } - } - - // We allow G_CONSTANT of types < 32b. - const unsigned MovOpc = - DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; - - if (isFP) { - // Either emit a FMOV, or emit a copy to emit a normal mov. - const TargetRegisterClass &GPRRC = - DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; - const TargetRegisterClass &FPRRC = - DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass; - - // Can we use a FMOV instruction to represent the immediate? - if (emitFMovForFConstant(I, MRI)) - return true; - - // Nope. Emit a copy and use a normal mov instead. - const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC); - MachineOperand &RegOp = I.getOperand(0); - RegOp.setReg(DefGPRReg); - MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); - MIB.buildCopy({DefReg}, {DefGPRReg}); - - if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); - return false; - } - - MachineOperand &ImmOp = I.getOperand(1); - // FIXME: Is going through int64_t always correct? - ImmOp.ChangeToImmediate( - ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); - } else if (I.getOperand(1).isCImm()) { - uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); - I.getOperand(1).ChangeToImmediate(Val); - } else if (I.getOperand(1).isImm()) { - uint64_t Val = I.getOperand(1).getImm(); - I.getOperand(1).ChangeToImmediate(Val); - } - - I.setDesc(TII.get(MovOpc)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - return true; - } - case TargetOpcode::G_EXTRACT: { - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg = I.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT DstTy = MRI.getType(DstReg); - (void)DstTy; - unsigned SrcSize = SrcTy.getSizeInBits(); - - if (SrcTy.getSizeInBits() > 64) { - // This should be an extract of an s128, which is like a vector extract. - if (SrcTy.getSizeInBits() != 128) - return false; - // Only support extracting 64 bits from an s128 at the moment. - if (DstTy.getSizeInBits() != 64) - return false; - - const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - // Check we have the right regbank always. - assert(SrcRB.getID() == AArch64::FPRRegBankID && - DstRB.getID() == AArch64::FPRRegBankID && - "Wrong extract regbank!"); - (void)SrcRB; - - // Emit the same code as a vector extract. - // Offset must be a multiple of 64. - unsigned Offset = I.getOperand(2).getImm(); - if (Offset % 64 != 0) - return false; - unsigned LaneIdx = Offset / 64; - MachineIRBuilder MIB(I); - MachineInstr *Extract = emitExtractVectorElt( - DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); - if (!Extract) - return false; - I.eraseFromParent(); - return true; - } - - I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); - MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + - Ty.getSizeInBits() - 1); - - if (SrcSize < 64) { - assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && - "unexpected G_EXTRACT types"); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); - MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) - .addReg(DstReg, 0, AArch64::sub_32); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), - AArch64::GPR32RegClass, MRI); - I.getOperand(0).setReg(DstReg); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_INSERT: { - LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - unsigned DstSize = DstTy.getSizeInBits(); - // Larger inserts are vectors, same-size ones should be something else by - // now (split up or turned into COPYs). - if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) - return false; - - I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); - unsigned LSB = I.getOperand(3).getImm(); - unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); - I.getOperand(3).setImm((DstSize - LSB) % DstSize); - MachineInstrBuilder(MF, I).addImm(Width - 1); - - if (DstSize < 64) { - assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && - "unexpected G_INSERT types"); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - BuildMI(MBB, I.getIterator(), I.getDebugLoc(), - TII.get(AArch64::SUBREG_TO_REG)) - .addDef(SrcReg) - .addImm(0) - .addUse(I.getOperand(2).getReg()) - .addImm(AArch64::sub_32); - RBI.constrainGenericRegister(I.getOperand(2).getReg(), - AArch64::GPR32RegClass, MRI); - I.getOperand(2).setReg(SrcReg); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - case TargetOpcode::G_FRAME_INDEX: { - // allocas and G_FRAME_INDEX are only supported in addrspace(0). - if (Ty != LLT::pointer(0, 64)) { - LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty - << ", expected: " << LLT::pointer(0, 64) << '\n'); - return false; - } - I.setDesc(TII.get(AArch64::ADDXri)); - - // MOs for a #0 shifted immediate. - I.addOperand(MachineOperand::CreateImm(0)); - I.addOperand(MachineOperand::CreateImm(0)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_GLOBAL_VALUE: { - auto GV = I.getOperand(1).getGlobal(); - if (GV->isThreadLocal()) - return selectTLSGlobalValue(I, MRI); - - unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); - if (OpFlags & AArch64II::MO_GOT) { - I.setDesc(TII.get(AArch64::LOADgot)); - I.getOperand(1).setTargetFlags(OpFlags); - } else if (TM.getCodeModel() == CodeModel::Large) { - // Materialize the global using movz/movk instructions. 
- materializeLargeCMVal(I, GV, OpFlags); - I.eraseFromParent(); - return true; - } else if (TM.getCodeModel() == CodeModel::Tiny) { - I.setDesc(TII.get(AArch64::ADR)); - I.getOperand(1).setTargetFlags(OpFlags); - } else { - I.setDesc(TII.get(AArch64::MOVaddr)); - I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); - MachineInstrBuilder MIB(MF, I); - MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), - OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - } - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_ZEXTLOAD: - case TargetOpcode::G_LOAD: - case TargetOpcode::G_STORE: { - bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; - MachineIRBuilder MIB(I); - - LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); - - if (PtrTy != LLT::pointer(0, 64)) { - LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy - << ", expected: " << LLT::pointer(0, 64) << '\n'); - return false; - } - - auto &MemOp = **I.memoperands_begin(); - if (MemOp.isAtomic()) { - // For now we just support s8 acquire loads to be able to compile stack - // protector code. - if (MemOp.getOrdering() == AtomicOrdering::Acquire && - MemOp.getSize() == 1) { - I.setDesc(TII.get(AArch64::LDARB)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); - return false; - } - unsigned MemSizeInBits = MemOp.getSize() * 8; - - const Register PtrReg = I.getOperand(1).getReg(); -#ifndef NDEBUG - const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); - // Sanity-check the pointer register. - assert(PtrRB.getID() == AArch64::GPRRegBankID && - "Load/Store pointer operand isn't a GPR"); - assert(MRI.getType(PtrReg).isPointer() && - "Load/Store pointer operand isn't a pointer"); -#endif - - const Register ValReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); - - const unsigned NewOpc = - selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - uint64_t Offset = 0; - auto *PtrMI = MRI.getVRegDef(PtrReg); - - // Try to fold a GEP into our unsigned immediate addressing mode. - if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { - if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { - int64_t Imm = *COff; - const unsigned Size = MemSizeInBits / 8; - const unsigned Scale = Log2_32(Size); - if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { - Register Ptr2Reg = PtrMI->getOperand(1).getReg(); - I.getOperand(1).setReg(Ptr2Reg); - PtrMI = MRI.getVRegDef(Ptr2Reg); - Offset = Imm / Size; - } - } - } - - // If we haven't folded anything into our addressing mode yet, try to fold - // a frame index into the base+offset. - if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) - I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); - - I.addOperand(MachineOperand::CreateImm(Offset)); - - // If we're storing a 0, use WZR/XZR. - if (auto CVal = getConstantVRegVal(ValReg, MRI)) { - if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { - if (I.getOpcode() == AArch64::STRWui) - I.getOperand(0).setReg(AArch64::WZR); - else if (I.getOpcode() == AArch64::STRXui) - I.getOperand(0).setReg(AArch64::XZR); - } - } - - if (IsZExtLoad) { - // The zextload from a smaller type to i32 should be handled by the importer. 
- if (MRI.getType(ValReg).getSizeInBits() != 64) - return false; - // If we have a ZEXTLOAD then change the load's type to be a narrower reg - //and zero_extend with SUBREG_TO_REG. - Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - Register DstReg = I.getOperand(0).getReg(); - I.getOperand(0).setReg(LdReg); - - MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); - MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) - .addImm(0) - .addUse(LdReg) - .addImm(AArch64::sub_32); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, - MRI); - } - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_SMULH: - case TargetOpcode::G_UMULH: { - // Reject the various things we don't support yet. - if (unsupportedBinOp(I, RBI, MRI, TRI)) - return false; - - const Register DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); - return false; - } - - if (Ty != LLT::scalar(64)) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty - << ", expected: " << LLT::scalar(64) << '\n'); - return false; - } - - unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr - : AArch64::UMULHrr; - I.setDesc(TII.get(NewOpc)); - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - - case TargetOpcode::G_ASHR: - if (MRI.getType(I.getOperand(0).getReg()).isVector()) - return selectVectorASHR(I, MRI); - LLVM_FALLTHROUGH; - case TargetOpcode::G_SHL: - if (Opcode == TargetOpcode::G_SHL && - MRI.getType(I.getOperand(0).getReg()).isVector()) - return selectVectorSHL(I, MRI); - LLVM_FALLTHROUGH; - case TargetOpcode::G_OR: - case TargetOpcode::G_LSHR: { - // Reject the various things we don't support yet. - if (unsupportedBinOp(I, RBI, MRI, TRI)) - return false; - - const unsigned OpSize = Ty.getSizeInBits(); - - const Register DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - // FIXME: Should the type be always reset in setDesc? - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - - case TargetOpcode::G_PTR_ADD: { - MachineIRBuilder MIRBuilder(I); - emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), - MIRBuilder); - I.eraseFromParent(); - return true; - } - case TargetOpcode::G_UADDO: { - // TODO: Support other types. - unsigned OpSize = Ty.getSizeInBits(); - if (OpSize != 32 && OpSize != 64) { - LLVM_DEBUG( - dbgs() - << "G_UADDO currently only supported for 32 and 64 b types.\n"); - return false; - } - - // TODO: Support vectors. - if (Ty.isVector()) { - LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n"); - return false; - } - - // Add and set the set condition flag. - unsigned AddsOpc = OpSize == 32 ? 
AArch64::ADDSWrr : AArch64::ADDSXrr; - MachineIRBuilder MIRBuilder(I); - auto AddsMI = MIRBuilder.buildInstr( - AddsOpc, {I.getOperand(0).getReg()}, - {I.getOperand(2).getReg(), I.getOperand(3).getReg()}); - constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); - - // Now, put the overflow result in the register given by the first operand - // to the G_UADDO. CSINC increments the result when the predicate is false, - // so to get the increment when it's true, we need to use the inverse. In - // this case, we want to increment when carry is set. - auto CsetMI = MIRBuilder - .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()}, - {Register(AArch64::WZR), Register(AArch64::WZR)}) - .addImm(getInvertedCondCode(AArch64CC::HS)); - constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI); - I.eraseFromParent(); - return true; - } - - case TargetOpcode::G_PTR_MASK: { - uint64_t Align = I.getOperand(2).getImm(); - if (Align >= 64 || Align == 0) - return false; - - uint64_t Mask = ~((1ULL << Align) - 1); - I.setDesc(TII.get(AArch64::ANDXri)); - I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } - case TargetOpcode::G_PTRTOINT: - case TargetOpcode::G_TRUNC: { - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); - - const Register DstReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); - - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); - - if (DstRB.getID() != SrcRB.getID()) { - LLVM_DEBUG( - dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); - return false; - } - - if (DstRB.getID() == AArch64::GPRRegBankID) { - const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(DstTy, DstRB, RBI); - if (!DstRC) - return false; - - const TargetRegisterClass *SrcRC = - getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); - if (!SrcRC) - return false; - - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); - return false; - } - - if (DstRC == SrcRC) { - // Nothing to be done - } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && - SrcTy == LLT::scalar(64)) { - llvm_unreachable("TableGen can import this case"); - return false; - } else if (DstRC == &AArch64::GPR32RegClass && - SrcRC == &AArch64::GPR64RegClass) { - I.getOperand(1).setSubReg(AArch64::sub_32); - } else { - LLVM_DEBUG( - dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); - return false; - } - - I.setDesc(TII.get(TargetOpcode::COPY)); - return true; - } else if (DstRB.getID() == AArch64::FPRRegBankID) { - if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) { - I.setDesc(TII.get(AArch64::XTNv4i16)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - return true; - } - - if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { - MachineIRBuilder MIB(I); - MachineInstr *Extract = emitExtractVectorElt( - DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); - if (!Extract) - return false; - I.eraseFromParent(); - return true; - } - } - - return false; - } - - case TargetOpcode::G_ANYEXT: { - const Register DstReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); - - const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); - if (RBDst.getID() != 
AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst - << ", expected: GPR\n"); - return false; - } - - const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); - if (RBSrc.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc - << ", expected: GPR\n"); - return false; - } - - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - - if (DstSize == 0) { - LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); - return false; - } - - if (DstSize != 64 && DstSize > 32) { - LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize - << ", expected: 32 or 64\n"); - return false; - } - // At this point G_ANYEXT is just like a plain COPY, but we need - // to explicitly form the 64-bit value if any. - if (DstSize > 32) { - Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) - .addDef(ExtSrc) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::sub_32); - I.getOperand(1).setReg(ExtSrc); - } - return selectCopy(I, TII, MRI, TRI, RBI); - } - - case TargetOpcode::G_ZEXT: - case TargetOpcode::G_SEXT: { - unsigned Opcode = I.getOpcode(); - const bool IsSigned = Opcode == TargetOpcode::G_SEXT; - const Register DefReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DefReg); - const LLT SrcTy = MRI.getType(SrcReg); - unsigned DstSize = DstTy.getSizeInBits(); - unsigned SrcSize = SrcTy.getSizeInBits(); - - if (DstTy.isVector()) - return false; // Should be handled by imported patterns. - - assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == - AArch64::GPRRegBankID && - "Unexpected ext regbank"); - - MachineIRBuilder MIB(I); - MachineInstr *ExtI; - - // First check if we're extending the result of a load which has a dest type - // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest - // GPR register on AArch64 and all loads which are smaller automatically - // zero-extend the upper bits. E.g. - // %v(s8) = G_LOAD %p, :: (load 1) - // %v2(s32) = G_ZEXT %v(s8) - if (!IsSigned) { - auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); - if (LoadMI && - RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) { - const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); - unsigned BytesLoaded = MemOp->getSize(); - if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) - return selectCopy(I, TII, MRI, TRI, RBI); - } - } - - if (DstSize == 64) { - // FIXME: Can we avoid manually doing this? - if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) - << " operand\n"); - return false; - } - - auto SubregToReg = - MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::sub_32); - - ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, - {DefReg}, {SubregToReg}) - .addImm(0) - .addImm(SrcSize - 1); - } else if (DstSize <= 32) { - ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri, - {DefReg}, {SrcReg}) - .addImm(0) - .addImm(SrcSize - 1); - } else { - return false; - } - - constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); - I.eraseFromParent(); - return true; - } - - case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: - case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: { - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), - SrcTy = MRI.getType(I.getOperand(1).getReg()); - const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); - if (NewOpc == Opcode) - return false; - - I.setDesc(TII.get(NewOpc)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - - return true; - } - - - case TargetOpcode::G_INTTOPTR: - // The importer is currently unable to import pointer types since they - // didn't exist in SelectionDAG. - return selectCopy(I, TII, MRI, TRI, RBI); - - case TargetOpcode::G_BITCAST: - // Imported SelectionDAG rules can handle every bitcast except those that - // bitcast from a type to the same type. Ideally, these shouldn't occur - // but we might not run an optimizer that deletes them. The other exception - // is bitcasts involving pointer types, as SelectionDAG has no knowledge - // of them. - return selectCopy(I, TII, MRI, TRI, RBI); - - case TargetOpcode::G_SELECT: { - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { - LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty - << ", expected: " << LLT::scalar(1) << '\n'); - return false; - } - - const Register CondReg = I.getOperand(1).getReg(); - const Register TReg = I.getOperand(2).getReg(); - const Register FReg = I.getOperand(3).getReg(); - - if (tryOptSelect(I)) - return true; - - Register CSelOpc = selectSelectOpc(I, MRI, RBI); - MachineInstr &TstMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) - .addDef(AArch64::WZR) - .addUse(CondReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); - - MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc)) - .addDef(I.getOperand(0).getReg()) - .addUse(TReg) - .addUse(FReg) - .addImm(AArch64CC::NE); - - constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI); - - I.eraseFromParent(); - return true; - } - case TargetOpcode::G_ICMP: { - if (Ty.isVector()) - return selectVectorICmp(I, MRI); - - if (Ty != LLT::scalar(32)) { - LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - MachineIRBuilder MIRBuilder(I); - if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), - MIRBuilder)) - return false; - emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(), - MIRBuilder); - I.eraseFromParent(); - return true; - } - - case TargetOpcode::G_FCMP: { - if (Ty != LLT::scalar(32)) { - LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - unsigned CmpOpc = selectFCMPOpc(I, MRI); - if (!CmpOpc) - return false; - - // FIXME: regbank - - AArch64CC::CondCode CC1, CC2; - changeFCMPPredToAArch64CC( - (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2); - - // Partially build the compare. Decide if we need to add a use for the - // third operand based off whether or not we're comparing against 0.0. 
- auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) - .addUse(I.getOperand(2).getReg()); - - // If we don't have an immediate compare, then we need to add a use of the - // register which wasn't used for the immediate. - // Note that the immediate will always be the last operand. - if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) - CmpMI = CmpMI.addUse(I.getOperand(3).getReg()); - - const Register DefReg = I.getOperand(0).getReg(); - Register Def1Reg = DefReg; - if (CC2 != AArch64CC::AL) - Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - - MachineInstr &CSetMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) - .addDef(Def1Reg) - .addUse(AArch64::WZR) - .addUse(AArch64::WZR) - .addImm(getInvertedCondCode(CC1)); - - if (CC2 != AArch64CC::AL) { - Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MachineInstr &CSet2MI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) - .addDef(Def2Reg) - .addUse(AArch64::WZR) - .addUse(AArch64::WZR) - .addImm(getInvertedCondCode(CC2)); - MachineInstr &OrMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) - .addDef(DefReg) - .addUse(Def1Reg) - .addUse(Def2Reg); - constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); - } - constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); - - I.eraseFromParent(); - return true; - } - case TargetOpcode::G_VASTART: - return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) - : selectVaStartAAPCS(I, MF, MRI); - case TargetOpcode::G_INTRINSIC: - return selectIntrinsic(I, MRI); - case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - return selectIntrinsicWithSideEffects(I, MRI); - case TargetOpcode::G_IMPLICIT_DEF: { - I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const Register DstReg = I.getOperand(0).getReg(); - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(DstTy, DstRB, RBI); - RBI.constrainGenericRegister(DstReg, *DstRC, MRI); - return true; - } - case TargetOpcode::G_BLOCK_ADDR: { - if (TM.getCodeModel() == CodeModel::Large) { - materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); - I.eraseFromParent(); - return true; - } else { - I.setDesc(TII.get(AArch64::MOVaddrBA)); - auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), - I.getOperand(0).getReg()) - .addBlockAddress(I.getOperand(1).getBlockAddress(), - /* Offset */ 0, AArch64II::MO_PAGE) - .addBlockAddress( - I.getOperand(1).getBlockAddress(), /* Offset */ 0, - AArch64II::MO_NC | AArch64II::MO_PAGEOFF); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); - } - } - case TargetOpcode::G_INTRINSIC_TRUNC: - return selectIntrinsicTrunc(I, MRI); - case TargetOpcode::G_INTRINSIC_ROUND: - return selectIntrinsicRound(I, MRI); - case TargetOpcode::G_BUILD_VECTOR: - return selectBuildVector(I, MRI); - case TargetOpcode::G_MERGE_VALUES: - return selectMergeValues(I, MRI); - case TargetOpcode::G_UNMERGE_VALUES: - return selectUnmergeValues(I, MRI); - case TargetOpcode::G_SHUFFLE_VECTOR: - return selectShuffleVector(I, MRI); - case TargetOpcode::G_EXTRACT_VECTOR_ELT: - return selectExtractElt(I, MRI); - case TargetOpcode::G_INSERT_VECTOR_ELT: - return selectInsertElt(I, MRI); - case TargetOpcode::G_CONCAT_VECTORS: - return selectConcatVectors(I, 
MRI); - case TargetOpcode::G_JUMP_TABLE: - return selectJumpTable(I, MRI); - } - - return false; -} - -bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, - MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); - Register JTAddr = I.getOperand(0).getReg(); - unsigned JTI = I.getOperand(1).getIndex(); - Register Index = I.getOperand(2).getReg(); - MachineIRBuilder MIB(I); - - Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg}, - {JTAddr, Index}) - .addJumpTableIndex(JTI); - - // Build the indirect branch. - MIB.buildInstr(AArch64::BR, {}, {TargetReg}); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectJumpTable( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); - assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); - - Register DstReg = I.getOperand(0).getReg(); - unsigned JTI = I.getOperand(1).getIndex(); - // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. - MachineIRBuilder MIB(I); - auto MovMI = - MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) - .addJumpTableIndex(JTI, AArch64II::MO_PAGE) - .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); -} - -bool AArch64InstructionSelector::selectTLSGlobalValue( - MachineInstr &I, MachineRegisterInfo &MRI) const { - if (!STI.isTargetMachO()) - return false; - MachineFunction &MF = *I.getParent()->getParent(); - MF.getFrameInfo().setAdjustsStack(true); - - const GlobalValue &GV = *I.getOperand(1).getGlobal(); - MachineIRBuilder MIB(I); - - MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) - .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); - - auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, - {Register(AArch64::X0)}) - .addImm(0); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - MIB.buildInstr(AArch64::BLR, {}, {Load}) - .addDef(AArch64::X0, RegState::Implicit) - .addRegMask(TRI.getTLSCallPreservedMask()); - - MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, - MRI); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectIntrinsicTrunc( - MachineInstr &I, MachineRegisterInfo &MRI) const { - const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); - - // Select the correct opcode. 
- unsigned Opc = 0; - if (!SrcTy.isVector()) { - switch (SrcTy.getSizeInBits()) { - default: - case 16: - Opc = AArch64::FRINTZHr; - break; - case 32: - Opc = AArch64::FRINTZSr; - break; - case 64: - Opc = AArch64::FRINTZDr; - break; - } - } else { - unsigned NumElts = SrcTy.getNumElements(); - switch (SrcTy.getElementType().getSizeInBits()) { - default: - break; - case 16: - if (NumElts == 4) - Opc = AArch64::FRINTZv4f16; - else if (NumElts == 8) - Opc = AArch64::FRINTZv8f16; - break; - case 32: - if (NumElts == 2) - Opc = AArch64::FRINTZv2f32; - else if (NumElts == 4) - Opc = AArch64::FRINTZv4f32; - break; - case 64: - if (NumElts == 2) - Opc = AArch64::FRINTZv2f64; - break; - } - } - - if (!Opc) { - // Didn't get an opcode above, bail. - LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); - return false; - } - - // Legalization would have set us up perfectly for this; we just need to - // set the opcode and move on. - I.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - -bool AArch64InstructionSelector::selectIntrinsicRound( - MachineInstr &I, MachineRegisterInfo &MRI) const { - const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); - - // Select the correct opcode. - unsigned Opc = 0; - if (!SrcTy.isVector()) { - switch (SrcTy.getSizeInBits()) { - default: - case 16: - Opc = AArch64::FRINTAHr; - break; - case 32: - Opc = AArch64::FRINTASr; - break; - case 64: - Opc = AArch64::FRINTADr; - break; - } - } else { - unsigned NumElts = SrcTy.getNumElements(); - switch (SrcTy.getElementType().getSizeInBits()) { - default: - break; - case 16: - if (NumElts == 4) - Opc = AArch64::FRINTAv4f16; - else if (NumElts == 8) - Opc = AArch64::FRINTAv8f16; - break; - case 32: - if (NumElts == 2) - Opc = AArch64::FRINTAv2f32; - else if (NumElts == 4) - Opc = AArch64::FRINTAv4f32; - break; - case 64: - if (NumElts == 2) - Opc = AArch64::FRINTAv2f64; - break; - } - } - - if (!Opc) { - // Didn't get an opcode above, bail. - LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); - return false; - } - - // Legalization would have set us up perfectly for this; we just need to - // set the opcode and move on. - I.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - -bool AArch64InstructionSelector::selectVectorICmp( - MachineInstr &I, MachineRegisterInfo &MRI) const { - Register DstReg = I.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - Register SrcReg = I.getOperand(2).getReg(); - Register Src2Reg = I.getOperand(3).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - - unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); - unsigned NumElts = DstTy.getNumElements(); - - // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b - // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 - // Third index is cc opcode: - // 0 == eq - // 1 == ugt - // 2 == uge - // 3 == ult - // 4 == ule - // 5 == sgt - // 6 == sge - // 7 == slt - // 8 == sle - // ne is done by negating 'eq' result. - - // This table below assumes that for some comparisons the operands will be - // commuted. 
- // ult op == commute + ugt op - // ule op == commute + uge op - // slt op == commute + sgt op - // sle op == commute + sge op - unsigned PredIdx = 0; - bool SwapOperands = false; - CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); - switch (Pred) { - case CmpInst::ICMP_NE: - case CmpInst::ICMP_EQ: - PredIdx = 0; - break; - case CmpInst::ICMP_UGT: - PredIdx = 1; - break; - case CmpInst::ICMP_UGE: - PredIdx = 2; - break; - case CmpInst::ICMP_ULT: - PredIdx = 3; - SwapOperands = true; - break; - case CmpInst::ICMP_ULE: - PredIdx = 4; - SwapOperands = true; - break; - case CmpInst::ICMP_SGT: - PredIdx = 5; - break; - case CmpInst::ICMP_SGE: - PredIdx = 6; - break; - case CmpInst::ICMP_SLT: - PredIdx = 7; - SwapOperands = true; - break; - case CmpInst::ICMP_SLE: - PredIdx = 8; - SwapOperands = true; - break; - default: - llvm_unreachable("Unhandled icmp predicate"); - return false; - } - - // This table obviously should be tablegen'd when we have our GISel native - // tablegen selector. - - static const unsigned OpcTable[4][4][9] = { - { - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, - AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, - AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, - {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, - AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, - AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} - }, - { - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, - AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, - AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, - {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, - AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, - AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */} - }, - { - {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, - AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, - AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, - {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, - AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, - AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */} - }, - { - {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, - AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, - AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {0 /* invalid 
*/, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */}, - {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, - 0 /* invalid */} - }, - }; - unsigned EltIdx = Log2_32(SrcEltSize / 8); - unsigned NumEltsIdx = Log2_32(NumElts / 2); - unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; - if (!Opc) { - LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); - return false; - } - - const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); - const TargetRegisterClass *SrcRC = - getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); - if (!SrcRC) { - LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); - return false; - } - - unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; - if (SrcTy.getSizeInBits() == 128) - NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; - - if (SwapOperands) - std::swap(SrcReg, Src2Reg); - - MachineIRBuilder MIB(I); - auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); - constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); - - // Invert if we had a 'ne' cc. - if (NotOpc) { - Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); - constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); - } else { - MIB.buildCopy(DstReg, Cmp.getReg(0)); - } - RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); - I.eraseFromParent(); - return true; -} - -MachineInstr *AArch64InstructionSelector::emitScalarToVector( - unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, - MachineIRBuilder &MIRBuilder) const { - auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); - - auto BuildFn = [&](unsigned SubregIndex) { - auto Ins = - MIRBuilder - .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) - .addImm(SubregIndex); - constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); - constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); - return &*Ins; - }; - - switch (EltSize) { - case 16: - return BuildFn(AArch64::hsub); - case 32: - return BuildFn(AArch64::ssub); - case 64: - return BuildFn(AArch64::dsub); - default: - return nullptr; - } -} - -bool AArch64InstructionSelector::selectMergeValues( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); - assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); - const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - - if (I.getNumOperands() != 3) - return false; - - // Merging 2 s64s into an s128. 
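// A minimal standalone sketch (not part of this patch) of the predicate
// mapping used by the vector G_ICMP table above: the unsigned/signed "less"
// forms are handled by swapping the operands and reusing the "greater"
// opcodes, NE reuses EQ plus a NOT, and the table rows are indexed by
// log2(element bytes) and log2(lanes / 2).
#include <cassert>

enum class Pred { EQ, NE, UGT, UGE, ULT, ULE, SGT, SGE, SLT, SLE };

struct TableKey {
  unsigned PredIdx;  // column in the opcode table
  bool SwapOperands; // commute the compare operands?
  bool NegateResult; // NE is EQ followed by a NOT
};

static TableKey mapPredicate(Pred P) {
  switch (P) {
  case Pred::EQ:  return {0, false, false};
  case Pred::NE:  return {0, false, true};
  case Pred::UGT: return {1, false, false};
  case Pred::UGE: return {2, false, false};
  case Pred::ULT: return {3, true,  false}; // ult == commute + ugt
  case Pred::ULE: return {4, true,  false}; // ule == commute + uge
  case Pred::SGT: return {5, false, false};
  case Pred::SGE: return {6, false, false};
  case Pred::SLT: return {7, true,  false}; // slt == commute + sgt
  case Pred::SLE: return {8, true,  false}; // sle == commute + sge
  }
  assert(false && "unhandled predicate");
  return {0, false, false};
}

static unsigned log2u(unsigned V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

int main() {
  // e.g. <4 x s32>, ult: 32-bit elements -> index 2, 4 lanes -> index 1,
  // predicate column 3, operands swapped.
  TableKey K = mapPredicate(Pred::ULT);
  assert(log2u(32 / 8) == 2 && log2u(4 / 2) == 1);
  assert(K.PredIdx == 3 && K.SwapOperands && !K.NegateResult);
  return 0;
}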
- if (DstTy == LLT::scalar(128)) { - if (SrcTy.getSizeInBits() != 64) - return false; - MachineIRBuilder MIB(I); - Register DstReg = I.getOperand(0).getReg(); - Register Src1Reg = I.getOperand(1).getReg(); - Register Src2Reg = I.getOperand(2).getReg(); - auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); - MachineInstr *InsMI = - emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); - if (!InsMI) - return false; - MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), - Src2Reg, /* LaneIdx */ 1, RB, MIB); - if (!Ins2MI) - return false; - constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); - I.eraseFromParent(); - return true; - } - - if (RB.getID() != AArch64::GPRRegBankID) - return false; - - if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) - return false; - - auto *DstRC = &AArch64::GPR64RegClass; - Register SubToRegDef = MRI.createVirtualRegister(DstRC); - MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::SUBREG_TO_REG)) - .addDef(SubToRegDef) - .addImm(0) - .addUse(I.getOperand(1).getReg()) - .addImm(AArch64::sub_32); - Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); - // Need to anyext the second scalar before we can use bfm - MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::SUBREG_TO_REG)) - .addDef(SubToRegDef2) - .addImm(0) - .addUse(I.getOperand(2).getReg()) - .addImm(AArch64::sub_32); - MachineInstr &BFM = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) - .addDef(I.getOperand(0).getReg()) - .addUse(SubToRegDef) - .addUse(SubToRegDef2) - .addImm(32) - .addImm(31); - constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); - constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, - const unsigned EltSize) { - // Choose a lane copy opcode and subregister based off of the size of the - // vector's elements. - switch (EltSize) { - case 16: - CopyOpc = AArch64::CPYi16; - ExtractSubReg = AArch64::hsub; - break; - case 32: - CopyOpc = AArch64::CPYi32; - ExtractSubReg = AArch64::ssub; - break; - case 64: - CopyOpc = AArch64::CPYi64; - ExtractSubReg = AArch64::dsub; - break; - default: - // Unknown size, bail out. 
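// Standalone illustration (not part of this patch) of what the
// SUBREG_TO_REG + BFMXri(#32, #31) sequence above computes when merging two
// s32 values into an s64: the first operand becomes bits [31:0], and the low
// 32 bits of the second operand are inserted into bits [63:32].
#include <cassert>
#include <cstdint>

static uint64_t mergeS32Pair(uint32_t Lo, uint32_t Hi) {
  uint64_t Result = Lo;             // SUBREG_TO_REG: upper bits are zero
  Result |= (uint64_t)Hi << 32;     // BFM Xd, Xn, #32, #31 (aka BFI #32, #32)
  return Result;
}

int main() {
  assert(mergeS32Pair(0x11223344u, 0xAABBCCDDu) == 0xAABBCCDD11223344ULL);
  return 0;
}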
- LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); - return false; - } - return true; -} - -MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( - Optional DstReg, const RegisterBank &DstRB, LLT ScalarTy, - Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - unsigned CopyOpc = 0; - unsigned ExtractSubReg = 0; - if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { - LLVM_DEBUG( - dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); - return nullptr; - } - - const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); - if (!DstRC) { - LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); - return nullptr; - } - - const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); - const LLT &VecTy = MRI.getType(VecReg); - const TargetRegisterClass *VecRC = - getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); - if (!VecRC) { - LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); - return nullptr; - } - - // The register that we're going to copy into. - Register InsertReg = VecReg; - if (!DstReg) - DstReg = MRI.createVirtualRegister(DstRC); - // If the lane index is 0, we just use a subregister COPY. - if (LaneIdx == 0) { - auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) - .addReg(VecReg, 0, ExtractSubReg); - RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); - return &*Copy; - } - - // Lane copies require 128-bit wide registers. If we're dealing with an - // unpacked vector, then we need to move up to that width. Insert an implicit - // def and a subregister insert to get us there. - if (VecTy.getSizeInBits() != 128) { - MachineInstr *ScalarToVector = emitScalarToVector( - VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); - if (!ScalarToVector) - return nullptr; - InsertReg = ScalarToVector->getOperand(0).getReg(); - } - - MachineInstr *LaneCopyMI = - MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); - constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); - - // Make sure that we actually constrain the initial copy. - RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); - return LaneCopyMI; -} - -bool AArch64InstructionSelector::selectExtractElt( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && - "unexpected opcode!"); - Register DstReg = I.getOperand(0).getReg(); - const LLT NarrowTy = MRI.getType(DstReg); - const Register SrcReg = I.getOperand(1).getReg(); - const LLT WideTy = MRI.getType(SrcReg); - (void)WideTy; - assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && - "source register size too small!"); - assert(NarrowTy.isScalar() && "cannot extract vector into vector!"); - - // Need the lane index to determine the correct copy opcode. - MachineOperand &LaneIdxOp = I.getOperand(2); - assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); - - if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { - LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); - return false; - } - - // Find the index to extract from. 
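// A small standalone model (not from this patch) of the lane-extract logic
// above: lane 0 of a vector register is simply its low hsub/ssub/dsub
// subregister, while any other lane is copied out of a full 128-bit register
// at an offset of LaneIdx * element-size. The vector is modelled as a byte
// buffer; the check in main() assumes a little-endian host, matching the
// AArch64 lane layout.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t extractLane(const uint8_t *Vec, unsigned EltBytes,
                            unsigned LaneIdx) {
  uint64_t Elt = 0;
  // Lane 0: just the low EltBytes (a subregister copy). Other lanes: offset
  // by LaneIdx * EltBytes (a CPYi-style lane copy).
  std::memcpy(&Elt, Vec + LaneIdx * EltBytes, EltBytes);
  return Elt;
}

int main() {
  uint32_t V[4] = {10, 20, 30, 40}; // a <4 x s32> value
  assert(extractLane(reinterpret_cast<uint8_t *>(V), 4, 0) == 10);
  assert(extractLane(reinterpret_cast<uint8_t *>(V), 4, 2) == 30);
  return 0;
}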
- auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); - if (!VRegAndVal) - return false; - unsigned LaneIdx = VRegAndVal->Value; - - MachineIRBuilder MIRBuilder(I); - - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, - LaneIdx, MIRBuilder); - if (!Extract) - return false; - - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectSplitVectorUnmerge( - MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned NumElts = I.getNumOperands() - 1; - Register SrcReg = I.getOperand(NumElts).getReg(); - const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); - const LLT SrcTy = MRI.getType(SrcReg); - - assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); - if (SrcTy.getSizeInBits() > 128) { - LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); - return false; - } - - MachineIRBuilder MIB(I); - - // We implement a split vector operation by treating the sub-vectors as - // scalars and extracting them. - const RegisterBank &DstRB = - *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); - for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { - Register Dst = I.getOperand(OpIdx).getReg(); - MachineInstr *Extract = - emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); - if (!Extract) - return false; - } - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectUnmergeValues( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && - "unexpected opcode"); - - // TODO: Handle unmerging into GPRs and from scalars to scalars. - if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != - AArch64::FPRRegBankID || - RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != - AArch64::FPRRegBankID) { - LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " - "currently unsupported.\n"); - return false; - } - - // The last operand is the vector source register, and every other operand is - // a register to unpack into. - unsigned NumElts = I.getNumOperands() - 1; - Register SrcReg = I.getOperand(NumElts).getReg(); - const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); - const LLT WideTy = MRI.getType(SrcReg); - (void)WideTy; - assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && - "can only unmerge from vector or s128 types!"); - assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && - "source register size too small!"); - - if (!NarrowTy.isScalar()) - return selectSplitVectorUnmerge(I, MRI); - - MachineIRBuilder MIB(I); - - // Choose a lane copy opcode and subregister based off of the size of the - // vector's elements. - unsigned CopyOpc = 0; - unsigned ExtractSubReg = 0; - if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) - return false; - - // Set up for the lane copies. - MachineBasicBlock &MBB = *I.getParent(); - - // Stores the registers we'll be copying from. - SmallVector InsertRegs; - - // We'll use the first register twice, so we only need NumElts-1 registers. - unsigned NumInsertRegs = NumElts - 1; - - // If our elements fit into exactly 128 bits, then we can copy from the source - // directly. Otherwise, we need to do a bit of setup with some subregister - // inserts. - if (NarrowTy.getSizeInBits() * NumElts == 128) { - InsertRegs = SmallVector(NumInsertRegs, SrcReg); - } else { - // No. We have to perform subregister inserts. 
For each insert, create an - // implicit def and a subregister insert, and save the register we create. - for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { - Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); - MachineInstr &ImpDefMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), - ImpDefReg); - - // Now, create the subregister insert from SrcReg. - Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); - MachineInstr &InsMI = - *BuildMI(MBB, I, I.getDebugLoc(), - TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) - .addUse(ImpDefReg) - .addUse(SrcReg) - .addImm(AArch64::dsub); - - constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); - - // Save the register so that we can copy from it after. - InsertRegs.push_back(InsertReg); - } - } - - // Now that we've created any necessary subregister inserts, we can - // create the copies. - // - // Perform the first copy separately as a subregister copy. - Register CopyTo = I.getOperand(0).getReg(); - auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) - .addReg(InsertRegs[0], 0, ExtractSubReg); - constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); - - // Now, perform the remaining copies as vector lane copies. - unsigned LaneIdx = 1; - for (Register InsReg : InsertRegs) { - Register CopyTo = I.getOperand(LaneIdx).getReg(); - MachineInstr &CopyInst = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) - .addUse(InsReg) - .addImm(LaneIdx); - constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); - ++LaneIdx; - } - - // Separately constrain the first copy's destination. Because of the - // limitation in constrainOperandRegClass, we can't guarantee that this will - // actually be constrained. So, do it ourselves using the second operand. 
- const TargetRegisterClass *RC = - MRI.getRegClassOrNull(I.getOperand(1).getReg()); - if (!RC) { - LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); - return false; - } - - RBI.constrainGenericRegister(CopyTo, *RC, MRI); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectConcatVectors( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && - "Unexpected opcode"); - Register Dst = I.getOperand(0).getReg(); - Register Op1 = I.getOperand(1).getReg(); - Register Op2 = I.getOperand(2).getReg(); - MachineIRBuilder MIRBuilder(I); - MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder); - if (!ConcatMI) - return false; - I.eraseFromParent(); - return true; -} - -unsigned -AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, - MachineFunction &MF) const { - Type *CPTy = CPVal->getType(); - unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy); - if (Align == 0) - Align = MF.getDataLayout().getTypeAllocSize(CPTy); - - MachineConstantPool *MCP = MF.getConstantPool(); - return MCP->getConstantPoolIndex(CPVal, Align); -} - -MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( - Constant *CPVal, MachineIRBuilder &MIRBuilder) const { - unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF()); - - auto Adrp = - MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) - .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); - - MachineInstr *LoadMI = nullptr; - switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) { - case 16: - LoadMI = - &*MIRBuilder - .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) - .addConstantPoolIndex(CPIdx, 0, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - break; - case 8: - LoadMI = &*MIRBuilder - .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp}) - .addConstantPoolIndex( - CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - break; - default: - LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " - << *CPVal->getType()); - return nullptr; - } - constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); - constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); - return LoadMI; -} - -/// Return an pair to do an vector elt insert of a given -/// size and RB. 
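// Standalone sketch (not from this patch) of the ADRP + :lo12: address split
// used by the constant-pool load above: ADRP materialises the 4 KiB page of
// the symbol (MO_PAGE) and the load supplies the low 12 bits (MO_PAGEOFF).
#include <cassert>
#include <cstdint>

struct PageAddr {
  uint64_t Page;    // what ADRP produces (4 KiB aligned)
  uint64_t PageOff; // what the LDR's :lo12: operand supplies
};

static PageAddr splitForAdrp(uint64_t Addr) {
  return {Addr & ~0xfffULL, Addr & 0xfffULL};
}

int main() {
  PageAddr PA = splitForAdrp(0x12345678ULL);
  assert(PA.Page == 0x12345000ULL && PA.PageOff == 0x678ULL);
  assert(PA.Page + PA.PageOff == 0x12345678ULL);
  return 0;
}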
-static std::pair -getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { - unsigned Opc, SubregIdx; - if (RB.getID() == AArch64::GPRRegBankID) { - if (EltSize == 32) { - Opc = AArch64::INSvi32gpr; - SubregIdx = AArch64::ssub; - } else if (EltSize == 64) { - Opc = AArch64::INSvi64gpr; - SubregIdx = AArch64::dsub; - } else { - llvm_unreachable("invalid elt size!"); - } - } else { - if (EltSize == 8) { - Opc = AArch64::INSvi8lane; - SubregIdx = AArch64::bsub; - } else if (EltSize == 16) { - Opc = AArch64::INSvi16lane; - SubregIdx = AArch64::hsub; - } else if (EltSize == 32) { - Opc = AArch64::INSvi32lane; - SubregIdx = AArch64::ssub; - } else if (EltSize == 64) { - Opc = AArch64::INSvi64lane; - SubregIdx = AArch64::dsub; - } else { - llvm_unreachable("invalid elt size!"); - } - } - return std::make_pair(Opc, SubregIdx); -} - -MachineInstr * -AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, - MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, - {AArch64::ADDWrr, AArch64::ADDWri}}; - bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; - auto ImmFns = selectArithImmed(RHS); - unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; - auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()}); - - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(AddMI); - } else { - AddMI.addUse(RHS.getReg()); - } - - constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); - return &*AddMI; -} - -MachineInstr * -AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri}, - {AArch64::ADDSWrr, AArch64::ADDSWri}}; - bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); - auto ImmFns = selectArithImmed(RHS); - unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; - Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; - - auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()}); - - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(CmpMI); - } else { - CmpMI.addUse(RHS.getReg()); - } - - constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - return &*CmpMI; -} - -MachineInstr * -AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS, - MachineIRBuilder &MIRBuilder) const { - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - unsigned RegSize = MRI.getType(LHS).getSizeInBits(); - bool Is32Bit = (RegSize == 32); - static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri}, - {AArch64::ANDSWrr, AArch64::ANDSWri}}; - Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; - - // We might be able to fold in an immediate into the TST. We need to make sure - // it's a logical immediate though, since ANDS requires that. 
- auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI); - bool IsImmForm = ValAndVReg.hasValue() && - AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize); - unsigned Opc = OpcTable[Is32Bit][IsImmForm]; - auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); - - if (IsImmForm) - TstMI.addImm( - AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize)); - else - TstMI.addUse(RHS); - - constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - return &*TstMI; -} - -MachineInstr *AArch64InstructionSelector::emitIntegerCompare( - MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - - // Fold the compare if possible. - MachineInstr *FoldCmp = - tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder); - if (FoldCmp) - return FoldCmp; - - // Can't fold into a CMN. Just emit a normal compare. - unsigned CmpOpc = 0; - Register ZReg; - - LLT CmpTy = MRI.getType(LHS.getReg()); - assert((CmpTy.isScalar() || CmpTy.isPointer()) && - "Expected scalar or pointer"); - if (CmpTy == LLT::scalar(32)) { - CmpOpc = AArch64::SUBSWrr; - ZReg = AArch64::WZR; - } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { - CmpOpc = AArch64::SUBSXrr; - ZReg = AArch64::XZR; - } else { - return nullptr; - } - - // Try to match immediate forms. - auto ImmFns = selectArithImmed(RHS); - if (ImmFns) - CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri; - - auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg()); - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(CmpMI); - } else { - CmpMI.addUse(RHS.getReg()); - } - - // Make sure that we can constrain the compare that we emitted. - constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - return &*CmpMI; -} - -MachineInstr *AArch64InstructionSelector::emitVectorConcat( - Optional Dst, Register Op1, Register Op2, - MachineIRBuilder &MIRBuilder) const { - // We implement a vector concat by: - // 1. Use scalar_to_vector to insert the lower vector into the larger dest - // 2. Insert the upper vector into the destination's upper element - // TODO: some of this code is common with G_BUILD_VECTOR handling. - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - - const LLT Op1Ty = MRI.getType(Op1); - const LLT Op2Ty = MRI.getType(Op2); - - if (Op1Ty != Op2Ty) { - LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); - return nullptr; - } - assert(Op1Ty.isVector() && "Expected a vector for vector concat"); - - if (Op1Ty.getSizeInBits() >= 128) { - LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); - return nullptr; - } - - // At the moment we just support 64 bit vector concats. 
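// Standalone sketch (not from this patch) of the NZCV flags produced by the
// SUBS-into-WZR/XZR sequence that emitIntegerCompare above selects (i.e. a
// plain "cmp"); the condition codes (EQ, LO, LT, ...) are just predicates
// over these four bits.
#include <cassert>
#include <cstdint>

struct NZCV { bool N, Z, C, V; };

static NZCV cmp64(uint64_t A, uint64_t B) {
  uint64_t Res = A - B;
  NZCV F;
  F.N = (int64_t)Res < 0;
  F.Z = Res == 0;
  F.C = A >= B; // no unsigned borrow
  F.V = (((int64_t)A < 0) != ((int64_t)B < 0)) &&
        (((int64_t)Res < 0) != ((int64_t)A < 0)); // signed overflow
  return F;
}

int main() {
  assert(cmp64(5, 5).Z);          // EQ
  assert(!cmp64(3, 5).C);         // LO (unsigned less-than)
  NZCV F = cmp64(uint64_t(-3LL), 5);
  assert(F.N != F.V);             // LT (signed less-than)
  return 0;
}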
- if (Op1Ty.getSizeInBits() != 64) { - LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); - return nullptr; - } - - const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); - const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); - const TargetRegisterClass *DstRC = - getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2); - - MachineInstr *WidenedOp1 = - emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); - MachineInstr *WidenedOp2 = - emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); - if (!WidenedOp1 || !WidenedOp2) { - LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); - return nullptr; - } - - // Now do the insert of the upper element. - unsigned InsertOpc, InsSubRegIdx; - std::tie(InsertOpc, InsSubRegIdx) = - getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); - - if (!Dst) - Dst = MRI.createVirtualRegister(DstRC); - auto InsElt = - MIRBuilder - .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) - .addImm(1) /* Lane index */ - .addUse(WidenedOp2->getOperand(0).getReg()) - .addImm(0); - constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); - return &*InsElt; -} - -MachineInstr *AArch64InstructionSelector::emitFMovForFConstant( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && - "Expected a G_FCONSTANT!"); - MachineOperand &ImmOp = I.getOperand(1); - unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); - - // Only handle 32 and 64 bit defs for now. - if (DefSize != 32 && DefSize != 64) - return nullptr; - - // Don't handle null values using FMOV. - if (ImmOp.getFPImm()->isNullValue()) - return nullptr; - - // Get the immediate representation for the FMOV. - const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF(); - int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF) - : AArch64_AM::getFP64Imm(ImmValAPF); - - // If this is -1, it means the immediate can't be represented as the requested - // floating point value. Bail. - if (Imm == -1) - return nullptr; - - // Update MI to represent the new FMOV instruction, constrain it, and return. - ImmOp.ChangeToImmediate(Imm); - unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi; - I.setDesc(TII.get(MovOpc)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - return &I; -} - -MachineInstr * -AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred, - MachineIRBuilder &MIRBuilder) const { - // CSINC increments the result when the predicate is false. Invert it. - const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( - CmpInst::getInversePredicate((CmpInst::Predicate)Pred)); - auto I = - MIRBuilder - .buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)}) - .addImm(InvCC); - constrainSelectedInstRegOperands(*I, TII, TRI, RBI); - return &*I; -} - -bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { - MachineIRBuilder MIB(I); - MachineRegisterInfo &MRI = *MIB.getMRI(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - - // We want to recognize this pattern: - // - // $z = G_FCMP pred, $x, $y - // ... - // $w = G_SELECT $z, $a, $b - // - // Where the value of $z is *only* ever used by the G_SELECT (possibly with - // some copies/truncs in between.) 
- // - // If we see this, then we can emit something like this: - // - // fcmp $x, $y - // fcsel $w, $a, $b, pred - // - // Rather than emitting both of the rather long sequences in the standard - // G_FCMP/G_SELECT select methods. - - // First, check if the condition is defined by a compare. - MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); - while (CondDef) { - // We can only fold if all of the defs have one use. - if (!MRI.hasOneUse(CondDef->getOperand(0).getReg())) - return false; - - // We can skip over G_TRUNC since the condition is 1-bit. - // Truncating/extending can have no impact on the value. - unsigned Opc = CondDef->getOpcode(); - if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) - break; - - // Can't see past copies from physregs. - if (Opc == TargetOpcode::COPY && - Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) - return false; - - CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); - } - - // Is the condition defined by a compare? - if (!CondDef) - return false; - - unsigned CondOpc = CondDef->getOpcode(); - if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) - return false; - - AArch64CC::CondCode CondCode; - if (CondOpc == TargetOpcode::G_ICMP) { - CondCode = changeICMPPredToAArch64CC( - (CmpInst::Predicate)CondDef->getOperand(1).getPredicate()); - if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB)) { - LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); - return false; - } - } else { - // Get the condition code for the select. - AArch64CC::CondCode CondCode2; - changeFCMPPredToAArch64CC( - (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode, - CondCode2); - - // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two - // instructions to emit the comparison. - // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be - // unnecessary. - if (CondCode2 != AArch64CC::AL) - return false; - - // Make sure we'll be able to select the compare. - unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI); - if (!CmpOpc) - return false; - - // Emit a new compare. - auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()}); - if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) - Cmp.addUse(CondDef->getOperand(3).getReg()); - constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); - } - - // Emit the select. - unsigned CSelOpc = selectSelectOpc(I, MRI, RBI); - auto CSel = - MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()}, - {I.getOperand(2).getReg(), I.getOperand(3).getReg()}) - .addImm(CondCode); - constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( - MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && - "Unexpected MachineOperand"); - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - // We want to find this sort of thing: - // x = G_SUB 0, y - // G_ICMP z, x - // - // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. - // e.g: - // - // cmn z, y - - // Helper lambda to detect the subtract followed by the compare. - // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0. 
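// Standalone sketch (not from this patch) of why the G_SUB-into-compare fold
// described above is restricted to EQ/NE: "cmp x, 0-y" and "cmn x, y" agree
// on the Z flag (x - (0 - y) == 0 exactly when x + y == 0), but the carry
// and overflow flags of the subtract and the add can differ, so the ordered
// predicates are not folded.
#include <cassert>
#include <cstdint>

static bool eqViaCmp(uint64_t X, uint64_t Y) { return X - (0 - Y) == 0; }
static bool eqViaCmn(uint64_t X, uint64_t Y) { return X + Y == 0; }

int main() {
  const uint64_t Tests[][2] = {{5, (uint64_t)-5}, {5, 5}, {0, 0}, {1, ~0ULL}};
  for (auto &T : Tests)
    assert(eqViaCmp(T[0], T[1]) == eqViaCmn(T[0], T[1]));
  return 0;
}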
- auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) { - if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB) - return false; - - // Need to make sure NZCV is the same at the end of the transformation. - if (CC != AArch64CC::EQ && CC != AArch64CC::NE) - return false; - - // We want to match against SUBs. - if (DefMI->getOpcode() != TargetOpcode::G_SUB) - return false; - - // Make sure that we're getting - // x = G_SUB 0, y - auto ValAndVReg = - getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI); - if (!ValAndVReg || ValAndVReg->Value != 0) - return false; - - // This can safely be represented as a CMN. - return true; - }; - - // Check if the RHS or LHS of the G_ICMP is defined by a SUB - MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); - MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); - CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P); - - // Given this: - // - // x = G_SUB 0, y - // G_ICMP x, z - // - // Produce this: - // - // cmn y, z - if (IsCMN(LHSDef, CC)) - return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); - - // Same idea here, but with the RHS of the compare instead: - // - // Given this: - // - // x = G_SUB 0, y - // G_ICMP z, x - // - // Produce this: - // - // cmn z, y - if (IsCMN(RHSDef, CC)) - return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); - - // Given this: - // - // z = G_AND x, y - // G_ICMP z, 0 - // - // Produce this if the compare is signed: - // - // tst x, y - if (!isUnsignedICMPPred(P) && LHSDef && - LHSDef->getOpcode() == TargetOpcode::G_AND) { - // Make sure that the RHS is 0. - auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI); - if (!ValAndVReg || ValAndVReg->Value != 0) - return nullptr; - - return emitTST(LHSDef->getOperand(1).getReg(), - LHSDef->getOperand(2).getReg(), MIRBuilder); - } - - return nullptr; -} - -bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { - // Try to match a vector splat operation into a dup instruction. - // We're looking for this pattern: - // %scalar:gpr(s64) = COPY $x0 - // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF - // %cst0:gpr(s32) = G_CONSTANT i32 0 - // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) - // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) - // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, - // %zerovec(<2 x s32>) - // - // ...into: - // %splat = DUP %scalar - // We use the regbank of the scalar to determine which kind of dup to use. - MachineIRBuilder MIB(I); - MachineRegisterInfo &MRI = *MIB.getMRI(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - using namespace TargetOpcode; - using namespace MIPatternMatch; - - // Begin matching the insert. - auto *InsMI = - getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI); - if (!InsMI) - return false; - // Match the undef vector operand. - auto *UndefMI = - getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI); - if (!UndefMI) - return false; - // Match the scalar being splatted. - Register ScalarReg = InsMI->getOperand(2).getReg(); - const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI); - // Match the index constant 0. - int64_t Index = 0; - if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) - return false; - - // The shuffle's second operand doesn't matter if the mask is all zero. 
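// Standalone sketch (not from this patch) of the splat pattern matched
// above: inserting a scalar into lane 0 and shuffling with an all-zero mask
// broadcasts that scalar to every lane, which is exactly what a single DUP
// instruction does.
#include <algorithm>
#include <cassert>
#include <vector>

static bool isZeroMaskSplat(const std::vector<int> &Mask) {
  return std::all_of(Mask.begin(), Mask.end(),
                     [](int Elem) { return Elem == 0; });
}

int main() {
  assert(isZeroMaskSplat({0, 0, 0, 0}));  // <4 x T> splat -> one DUP
  assert(!isZeroMaskSplat({0, 1, 0, 1})); // a real shuffle, not a splat
  return 0;
}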
- ArrayRef Mask = I.getOperand(3).getShuffleMask(); - if (!all_of(Mask, [](int Elem) { return Elem == 0; })) - return false; - - // We're done, now find out what kind of splat we need. - LLT VecTy = MRI.getType(I.getOperand(0).getReg()); - LLT EltTy = VecTy.getElementType(); - if (EltTy.getSizeInBits() < 32) { - LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet"); - return false; - } - bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID; - unsigned Opc = 0; - if (IsFP) { - switch (EltTy.getSizeInBits()) { - case 32: - if (VecTy.getNumElements() == 2) { - Opc = AArch64::DUPv2i32lane; - } else { - Opc = AArch64::DUPv4i32lane; - assert(VecTy.getNumElements() == 4); - } - break; - case 64: - assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); - Opc = AArch64::DUPv2i64lane; - break; - } - } else { - switch (EltTy.getSizeInBits()) { - case 32: - if (VecTy.getNumElements() == 2) { - Opc = AArch64::DUPv2i32gpr; - } else { - Opc = AArch64::DUPv4i32gpr; - assert(VecTy.getNumElements() == 4); - } - break; - case 64: - assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); - Opc = AArch64::DUPv2i64gpr; - break; - } - } - assert(Opc && "Did not compute an opcode for a dup"); - - // For FP splats, we need to widen the scalar reg via undef too. - if (IsFP) { - MachineInstr *Widen = emitScalarToVector( - EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB); - if (!Widen) - return false; - ScalarReg = Widen->getOperand(0).getReg(); - } - auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg}); - if (IsFP) - Dup.addImm(0); - constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const { - if (TM.getOptLevel() == CodeGenOpt::None) - return false; - if (tryOptVectorDup(I)) - return true; - return false; -} - -bool AArch64InstructionSelector::selectShuffleVector( - MachineInstr &I, MachineRegisterInfo &MRI) const { - if (tryOptVectorShuffle(I)) - return true; - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - Register Src1Reg = I.getOperand(1).getReg(); - const LLT Src1Ty = MRI.getType(Src1Reg); - Register Src2Reg = I.getOperand(2).getReg(); - const LLT Src2Ty = MRI.getType(Src2Reg); - ArrayRef Mask = I.getOperand(3).getShuffleMask(); - - MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - LLVMContext &Ctx = MF.getFunction().getContext(); - - // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if - // it's originated from a <1 x T> type. Those should have been lowered into - // G_BUILD_VECTOR earlier. - if (!Src1Ty.isVector() || !Src2Ty.isVector()) { - LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); - return false; - } - - unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; - - SmallVector CstIdxs; - for (int Val : Mask) { - // For now, any undef indexes we'll just assume to be 0. This should be - // optimized in future, e.g. to select DUP etc. - Val = Val < 0 ? 0 : Val; - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); - } - } - - MachineIRBuilder MIRBuilder(I); - - // Use a constant pool to load the index vector for TBL. 
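// Standalone sketch (not from this patch) of how the TBL index vector above
// is built from a shuffle mask: each mask element selects an element of the
// concatenated sources and is expanded to its constituent byte indices
// (undef lanes are treated as 0 here, as in the selector).
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> buildTblIndices(const std::vector<int> &Mask,
                                            unsigned BytesPerElt) {
  std::vector<uint8_t> Bytes;
  for (int Val : Mask) {
    Val = Val < 0 ? 0 : Val; // undef -> lane 0
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte)
      Bytes.push_back(static_cast<uint8_t>(Byte + Val * BytesPerElt));
  }
  return Bytes;
}

int main() {
  // <4 x s32> shuffle mask <2, 0, 7, 1>: element 7 lives in the second
  // source register of the concatenated TBL operand.
  std::vector<uint8_t> Idx = buildTblIndices({2, 0, 7, 1}, 4);
  assert(Idx.size() == 16);
  assert(Idx[0] == 8 && Idx[1] == 9 && Idx[2] == 10 && Idx[3] == 11);
  assert(Idx[4] == 0 && Idx[8] == 28 && Idx[12] == 4);
  return 0;
}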
- Constant *CPVal = ConstantVector::get(CstIdxs); - MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder); - if (!IndexLoad) { - LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); - return false; - } - - if (DstTy.getSizeInBits() != 128) { - assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); - // This case can be done with TBL1. - MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder); - if (!Concat) { - LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); - return false; - } - - // The constant pool load will be 64 bits, so need to convert to FPR128 reg. - IndexLoad = - emitScalarToVector(64, &AArch64::FPR128RegClass, - IndexLoad->getOperand(0).getReg(), MIRBuilder); - - auto TBL1 = MIRBuilder.buildInstr( - AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, - {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); - constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); - - auto Copy = - MIRBuilder - .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) - .addReg(TBL1.getReg(0), 0, AArch64::dsub); - RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); - I.eraseFromParent(); - return true; - } - - // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive - // Q registers for regalloc. - auto RegSeq = MIRBuilder - .buildInstr(TargetOpcode::REG_SEQUENCE, - {&AArch64::QQRegClass}, {Src1Reg}) - .addImm(AArch64::qsub0) - .addUse(Src2Reg) - .addImm(AArch64::qsub1); - - auto TBL2 = - MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()}, - {RegSeq, IndexLoad->getOperand(0).getReg()}); - constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); - constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - -MachineInstr *AArch64InstructionSelector::emitLaneInsert( - Optional DstReg, Register SrcReg, Register EltReg, - unsigned LaneIdx, const RegisterBank &RB, - MachineIRBuilder &MIRBuilder) const { - MachineInstr *InsElt = nullptr; - const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - - // Create a register to define with the insert if one wasn't passed in. - if (!DstReg) - DstReg = MRI.createVirtualRegister(DstRC); - - unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); - unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; - - if (RB.getID() == AArch64::FPRRegBankID) { - auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); - InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) - .addImm(LaneIdx) - .addUse(InsSub->getOperand(0).getReg()) - .addImm(0); - } else { - InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) - .addImm(LaneIdx) - .addUse(EltReg); - } - - constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); - return InsElt; -} - -bool AArch64InstructionSelector::selectInsertElt( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); - - // Get information on the destination. - Register DstReg = I.getOperand(0).getReg(); - const LLT DstTy = MRI.getType(DstReg); - unsigned VecSize = DstTy.getSizeInBits(); - - // Get information on the element we want to insert into the destination. - Register EltReg = I.getOperand(2).getReg(); - const LLT EltTy = MRI.getType(EltReg); - unsigned EltSize = EltTy.getSizeInBits(); - if (EltSize < 16 || EltSize > 64) - return false; // Don't support all element types yet. - - // Find the definition of the index. 
Bail out if it's not defined by a - // G_CONSTANT. - Register IdxReg = I.getOperand(3).getReg(); - auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI); - if (!VRegAndVal) - return false; - unsigned LaneIdx = VRegAndVal->Value; - - // Perform the lane insert. - Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); - MachineIRBuilder MIRBuilder(I); - - if (VecSize < 128) { - // If the vector we're inserting into is smaller than 128 bits, widen it - // to 128 to do the insert. - MachineInstr *ScalarToVec = emitScalarToVector( - VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder); - if (!ScalarToVec) - return false; - SrcReg = ScalarToVec->getOperand(0).getReg(); - } - - // Create an insert into a new FPR128 register. - // Note that if our vector is already 128 bits, we end up emitting an extra - // register. - MachineInstr *InsMI = - emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder); - - if (VecSize < 128) { - // If we had to widen to perform the insert, then we have to demote back to - // the original size to get the result we want. - Register DemoteVec = InsMI->getOperand(0).getReg(); - const TargetRegisterClass *RC = - getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); - if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { - LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); - return false; - } - unsigned SubReg = 0; - if (!getSubRegForClass(RC, TRI, SubReg)) - return false; - if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { - LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize - << "\n"); - return false; - } - MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(DemoteVec, 0, SubReg); - RBI.constrainGenericRegister(DstReg, *RC, MRI); - } else { - // No widening needed. - InsMI->getOperand(0).setReg(DstReg); - constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); - } - - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectBuildVector( - MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); - // Until we port more of the optimized selections, for now just use a vector - // insert sequence. - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); - unsigned EltSize = EltTy.getSizeInBits(); - if (EltSize < 16 || EltSize > 64) - return false; // Don't support all element types yet. - const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - MachineIRBuilder MIRBuilder(I); - - const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; - MachineInstr *ScalarToVec = - emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, - I.getOperand(1).getReg(), MIRBuilder); - if (!ScalarToVec) - return false; - - Register DstVec = ScalarToVec->getOperand(0).getReg(); - unsigned DstSize = DstTy.getSizeInBits(); - - // Keep track of the last MI we inserted. Later on, we might be able to save - // a copy using it. - MachineInstr *PrevMI = nullptr; - for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { - // Note that if we don't do a subregister copy, we can end up making an - // extra register. - PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, - MIRBuilder); - DstVec = PrevMI->getOperand(0).getReg(); - } - - // If DstTy's size in bits is less than 128, then emit a subregister copy - // from DstVec to the last register we've defined. 
- if (DstSize < 128) { - // Force this to be FPR using the destination vector. - const TargetRegisterClass *RC = - getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); - if (!RC) - return false; - if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { - LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); - return false; - } - - unsigned SubReg = 0; - if (!getSubRegForClass(RC, TRI, SubReg)) - return false; - if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { - LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize - << "\n"); - return false; - } - - Register Reg = MRI.createVirtualRegister(RC); - Register DstReg = I.getOperand(0).getReg(); - - MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(DstVec, 0, SubReg); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(Reg); - RBI.constrainGenericRegister(DstReg, *RC, MRI); - } else { - // We don't need a subregister copy. Save a copy by re-using the - // destination register on the final insert. - assert(PrevMI && "PrevMI was null?"); - PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); - constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); - } - - I.eraseFromParent(); - return true; -} - -/// Helper function to find an intrinsic ID on an a MachineInstr. Returns the -/// ID if it exists, and 0 otherwise. -static unsigned findIntrinsicID(MachineInstr &I) { - auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) { - return Op.isIntrinsicID(); - }); - if (IntrinOp == I.operands_end()) - return 0; - return IntrinOp->getIntrinsicID(); -} - -bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( - MachineInstr &I, MachineRegisterInfo &MRI) const { - // Find the intrinsic ID. - unsigned IntrinID = findIntrinsicID(I); - if (!IntrinID) - return false; - MachineIRBuilder MIRBuilder(I); - - // Select the instruction. - switch (IntrinID) { - default: - return false; - case Intrinsic::trap: - MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1); - break; - case Intrinsic::debugtrap: - if (!STI.isTargetWindows()) - return false; - MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); - break; - } - - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectIntrinsic( - MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned IntrinID = findIntrinsicID(I); - if (!IntrinID) - return false; - MachineIRBuilder MIRBuilder(I); - - switch (IntrinID) { - default: - break; - case Intrinsic::aarch64_crypto_sha1h: - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg = I.getOperand(2).getReg(); - - // FIXME: Should this be an assert? - if (MRI.getType(DstReg).getSizeInBits() != 32 || - MRI.getType(SrcReg).getSizeInBits() != 32) - return false; - - // The operation has to happen on FPRs. Set up some new FPR registers for - // the source and destination if they are on GPRs. - if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { - SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)}); - - // Make sure the copy ends up getting constrained properly. - RBI.constrainGenericRegister(I.getOperand(2).getReg(), - AArch64::GPR32RegClass, MRI); - } - - if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) - DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - - // Actually insert the instruction. 
- auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); - constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); - - // Did we create a new register for the destination? - if (DstReg != I.getOperand(0).getReg()) { - // Yep. Copy the result of the instruction back into the original - // destination. - MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg}); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), - AArch64::GPR32RegClass, MRI); - } - - I.eraseFromParent(); - return true; - } - return false; -} - -static Optional getImmedFromMO(const MachineOperand &Root) { - auto &MI = *Root.getParent(); - auto &MBB = *MI.getParent(); - auto &MF = *MBB.getParent(); - auto &MRI = MF.getRegInfo(); - uint64_t Immed; - if (Root.isImm()) - Immed = Root.getImm(); - else if (Root.isCImm()) - Immed = Root.getCImm()->getZExtValue(); - else if (Root.isReg()) { - auto ValAndVReg = - getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); - if (!ValAndVReg) - return None; - Immed = ValAndVReg->Value; - } else - return None; - return Immed; -} - -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None || *MaybeImmed > 31) - return None; - uint64_t Enc = (32 - *MaybeImmed) & 0x1f; - return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; -} - -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None || *MaybeImmed > 31) - return None; - uint64_t Enc = 31 - *MaybeImmed; - return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; -} - -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None || *MaybeImmed > 63) - return None; - uint64_t Enc = (64 - *MaybeImmed) & 0x3f; - return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; -} - -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None || *MaybeImmed > 63) - return None; - uint64_t Enc = 63 - *MaybeImmed; - return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; -} - -/// Helper to select an immediate value that can be represented as a 12-bit -/// value shifted left by either 0 or 12. If it is possible to do so, return -/// the immediate and shift value. If not, return None. -/// -/// Used by selectArithImmed and selectNegArithImmed. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::select12BitValueWithLeftShift( - uint64_t Immed) const { - unsigned ShiftAmt; - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else - return None; - - unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, - }}; -} - -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand. 
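// Standalone sketch (not from this patch) of the "12-bit immediate,
// optionally shifted left by 12" test implemented above for ADD/SUB-class
// immediates.
#include <cassert>
#include <cstdint>
#include <utility>

// Returns {true, {imm12, shift}} when Immed is encodable, {false, ...}
// otherwise.
static std::pair<bool, std::pair<uint64_t, unsigned>>
encodeArithImmed(uint64_t Immed) {
  if (Immed >> 12 == 0)
    return {true, {Immed, 0}};        // fits in 12 bits, LSL #0
  if ((Immed & 0xfff) == 0 && Immed >> 24 == 0)
    return {true, {Immed >> 12, 12}}; // fits in 12 bits, LSL #12
  return {false, {0, 0}};
}

int main() {
  assert(encodeArithImmed(0xabc).first);
  assert(encodeArithImmed(0xabc000).second.second == 12);
  assert(!encodeArithImmed(0xabc001).first);  // needs both halves
  assert(!encodeArithImmed(0x1000000).first); // too wide even when shifted
  return 0;
}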
-InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None) - return None; - return select12BitValueWithLeftShift(*MaybeImmed); -} - -/// SelectNegArithImmed - As above, but negates the value before trying to -/// select it. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { - // We need a register here, because we need to know if we have a 64 or 32 - // bit immediate. - if (!Root.isReg()) - return None; - auto MaybeImmed = getImmedFromMO(Root); - if (MaybeImmed == None) - return None; - uint64_t Immed = *MaybeImmed; - - // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" - // have the opposite effect on the C flag, so this pattern mustn't match under - // those circumstances. - if (Immed == 0) - return None; - - // Check if we're dealing with a 32-bit type on the root or a 64-bit type on - // the root. - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - if (MRI.getType(Root.getReg()).getSizeInBits() == 32) - Immed = ~((uint32_t)Immed) + 1; - else - Immed = ~Immed + 1ULL; - - if (Immed & 0xFFFFFFFFFF000000ULL) - return None; - - Immed &= 0xFFFFFFULL; - return select12BitValueWithLeftShift(Immed); -} - -/// Return true if it is worth folding MI into an extended register. That is, -/// if it's safe to pull it into the addressing mode of a load or store as a -/// shift. -bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( - MachineInstr &MI, const MachineRegisterInfo &MRI) const { - // Always fold if there is one use, or if we're optimizing for size. - Register DefReg = MI.getOperand(0).getReg(); - if (MRI.hasOneUse(DefReg) || - MI.getParent()->getParent()->getFunction().hasMinSize()) - return true; - - // It's better to avoid folding and recomputing shifts when we don't have a - // fastpath. - if (!STI.hasLSLFast()) - return false; - - // We have a fastpath, so folding a shift in and potentially computing it - // many times may be beneficial. Check if this is only used in memory ops. - // If it is, then we should fold. - return all_of(MRI.use_instructions(DefReg), - [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); -} - -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectExtendedSHL( - MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, - unsigned SizeInBytes, bool WantsExt) const { - assert(Base.isReg() && "Expected base to be a register operand"); - assert(Offset.isReg() && "Expected offset to be a register operand"); - - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); - if (!OffsetInst) - return None; - - unsigned OffsetOpc = OffsetInst->getOpcode(); - if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) - return None; - - // Make sure that the memory op is a valid size. - int64_t LegalShiftVal = Log2_32(SizeInBytes); - if (LegalShiftVal == 0) - return None; - if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) - return None; - - // Now, try to find the specific G_CONSTANT. 
Start by assuming that the - // register we will offset is the LHS, and the register containing the - // constant is the RHS. - Register OffsetReg = OffsetInst->getOperand(1).getReg(); - Register ConstantReg = OffsetInst->getOperand(2).getReg(); - auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); - if (!ValAndVReg) { - // We didn't get a constant on the RHS. If the opcode is a shift, then - // we're done. - if (OffsetOpc == TargetOpcode::G_SHL) - return None; - - // If we have a G_MUL, we can use either register. Try looking at the RHS. - std::swap(OffsetReg, ConstantReg); - ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); - if (!ValAndVReg) - return None; - } - - // The value must fit into 3 bits, and must be positive. Make sure that is - // true. - int64_t ImmVal = ValAndVReg->Value; - - // Since we're going to pull this into a shift, the constant value must be - // a power of 2. If we got a multiply, then we need to check this. - if (OffsetOpc == TargetOpcode::G_MUL) { - if (!isPowerOf2_32(ImmVal)) - return None; - - // Got a power of 2. So, the amount we'll shift is the log base-2 of that. - ImmVal = Log2_32(ImmVal); - } - - if ((ImmVal & 0x7) != ImmVal) - return None; - - // We are only allowed to shift by LegalShiftVal. This shift value is built - // into the instruction, so we can't just use whatever we want. - if (ImmVal != LegalShiftVal) - return None; - - unsigned SignExtend = 0; - if (WantsExt) { - // Check if the offset is defined by an extend. - MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); - auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); - if (Ext == AArch64_AM::InvalidShiftExtend) - return None; - - SignExtend = Ext == AArch64_AM::SXTW; - - // Need a 32-bit wide register here. - MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); - OffsetReg = ExtInst->getOperand(1).getReg(); - OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); - } - - // We can use the LHS of the GEP as the base, and the LHS of the shift as an - // offset. Signify that we are shifting by setting the shift flag to 1. - return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, - [=](MachineInstrBuilder &MIB) { - // Need to add both immediates here to make sure that they are both - // added to the instruction. - MIB.addImm(SignExtend); - MIB.addImm(1); - }}}; -} - -/// This is used for computing addresses like this: -/// -/// ldr x1, [x2, x3, lsl #3] -/// -/// Where x2 is the base register, and x3 is an offset register. The shift-left -/// is a constant value specific to this load instruction. That is, we'll never -/// see anything other than a 3 here (which corresponds to the size of the -/// element being loaded.) -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( - MachineOperand &Root, unsigned SizeInBytes) const { - if (!Root.isReg()) - return None; - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - // We want to find something like this: - // - // val = G_CONSTANT LegalShiftVal - // shift = G_SHL off_reg val - // ptr = G_PTR_ADD base_reg shift - // x = G_LOAD ptr - // - // And fold it into this addressing mode: - // - // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] - - // Check if we can find the G_PTR_ADD. 
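// Standalone sketch (not from this patch) of the legality check performed by
// the extended-shift folding above: an offset computed as "off << k" or
// "off * c" can only be folded into "[base, off, lsl #k]" when the shift
// amount equals log2 of the access size (and, for multiplies, c is a power
// of two).
#include <cassert>
#include <cstdint>

static bool isPow2(uint64_t V) { return V && (V & (V - 1)) == 0; }
static unsigned log2u(uint64_t V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

static bool canFoldScaledOffset(unsigned SizeInBytes, bool IsMul,
                                uint64_t Amount) {
  unsigned LegalShift = log2u(SizeInBytes);
  if (LegalShift == 0)
    return false;       // byte accesses have no scaled register-offset form
  uint64_t Shift = Amount;
  if (IsMul) {
    if (!isPow2(Amount))
      return false;     // only power-of-two multiplies are really shifts
    Shift = log2u(Amount);
  }
  return (Shift & 0x7) == Shift && Shift == LegalShift;
}

int main() {
  assert(canFoldScaledOffset(8, /*IsMul=*/false, 3));  // ldr x, [b, o, lsl #3]
  assert(canFoldScaledOffset(8, /*IsMul=*/true, 8));   // o * 8 == o << 3
  assert(!canFoldScaledOffset(8, /*IsMul=*/false, 2)); // wrong scale
  assert(!canFoldScaledOffset(4, /*IsMul=*/true, 6));  // not a power of two
  return 0;
}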
- MachineInstr *PtrAdd = - getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); - if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) - return None; - - // Now, try to match an opcode which will match our specific offset. - // We want a G_SHL or a G_MUL. - MachineInstr *OffsetInst = - getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); - return selectExtendedSHL(Root, PtrAdd->getOperand(1), - OffsetInst->getOperand(0), SizeInBytes, - /*WantsExt=*/false); -} - -/// This is used for computing addresses like this: -/// -/// ldr x1, [x2, x3] -/// -/// Where x2 is the base register, and x3 is an offset register. -/// -/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation, -/// this will do so. Otherwise, it will return None. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeRegisterOffset( - MachineOperand &Root) const { - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - // We need a GEP. - MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); - if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) - return None; - - // If this is used more than once, let's not bother folding. - // TODO: Check if they are memory ops. If they are, then we can still fold - // without having to recompute anything. - if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) - return None; - - // Base is the GEP's LHS, offset is its RHS. - return {{[=](MachineInstrBuilder &MIB) { - MIB.addUse(Gep->getOperand(1).getReg()); - }, - [=](MachineInstrBuilder &MIB) { - MIB.addUse(Gep->getOperand(2).getReg()); - }, - [=](MachineInstrBuilder &MIB) { - // Need to add both immediates here to make sure that they are both - // added to the instruction. - MIB.addImm(0); - MIB.addImm(0); - }}}; -} - -/// This is intended to be equivalent to selectAddrModeXRO in -/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, - unsigned SizeInBytes) const { - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - // If we have a constant offset, then we probably don't want to match a - // register offset. - if (isBaseWithConstantOffset(Root, MRI)) - return None; - - // Try to fold shifts into the addressing mode. - auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); - if (AddrModeFns) - return AddrModeFns; - - // If that doesn't work, see if it's possible to fold in registers from - // a GEP. - return selectAddrModeRegisterOffset(Root); -} - -/// This is used for computing addresses like this: -/// -/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] -/// -/// Where we have a 64-bit base register, a 32-bit offset register, and an -/// extend (which may or may not be signed). -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, - unsigned SizeInBytes) const { - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - MachineInstr *PtrAdd = - getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); - if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) - return None; - - MachineOperand &LHS = PtrAdd->getOperand(1); - MachineOperand &RHS = PtrAdd->getOperand(2); - MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); - - // The first case is the same as selectAddrModeXRO, except we need an extend. 
- // In this case, we try to find a shift and extend, and fold them into the - // addressing mode. - // - // E.g. - // - // off_reg = G_Z/S/ANYEXT ext_reg - // val = G_CONSTANT LegalShiftVal - // shift = G_SHL off_reg val - // ptr = G_PTR_ADD base_reg shift - // x = G_LOAD ptr - // - // In this case we can get a load like this: - // - // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] - auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), - SizeInBytes, /*WantsExt=*/true); - if (ExtendedShl) - return ExtendedShl; - - // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. - // - // e.g. - // ldr something, [base_reg, ext_reg, sxtw] - if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) - return None; - - // Check if this is an extend. We'll get an extend type if it is. - AArch64_AM::ShiftExtendType Ext = - getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); - if (Ext == AArch64_AM::InvalidShiftExtend) - return None; - - // Need a 32-bit wide register. - MachineIRBuilder MIB(*PtrAdd); - Register ExtReg = - narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB); - unsigned SignExtend = Ext == AArch64_AM::SXTW; - - // Base is LHS, offset is ExtReg. - return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, - [=](MachineInstrBuilder &MIB) { - MIB.addImm(SignExtend); - MIB.addImm(0); - }}}; -} - -/// Select a "register plus unscaled signed 9-bit immediate" address. This -/// should only match when there is an offset that is not valid for a scaled -/// immediate addressing mode. The "Size" argument is the size in bytes of the -/// memory reference, which is needed here to know what is valid for a scaled -/// immediate. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, - unsigned Size) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - - if (!Root.isReg()) - return None; - - if (!isBaseWithConstantOffset(Root, MRI)) - return None; - - MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); - if (!RootDef) - return None; - - MachineOperand &OffImm = RootDef->getOperand(2); - if (!OffImm.isReg()) - return None; - MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); - if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) - return None; - int64_t RHSC; - MachineOperand &RHSOp1 = RHS->getOperand(1); - if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) - return None; - RHSC = RHSOp1.getCImm()->getSExtValue(); - - // If the offset is valid as a scaled immediate, don't match here. - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) - return None; - if (RHSC >= -256 && RHSC < 256) { - MachineOperand &Base = RootDef->getOperand(1); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, - }}; - } - return None; -} - -/// Select a "register plus scaled unsigned 12-bit immediate" address. The -/// "Size" argument is the size in bytes of the memory reference, which -/// determines the scale. 
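// Hedged sketch of the two immediate forms described above, assuming the
// access size is a power of two: an offset is usable in the scaled 12-bit
// form when it is Size-aligned, non-negative and below 4096 * Size; otherwise
// the unscaled form accepts any value in [-256, 256). The enum and helper
// name are placeholders, not backend types.
#include <cstdint>

enum class AddrImmKind { ScaledU12, UnscaledS9, NotAnImm };

static AddrImmKind classifyOffset(int64_t Offset, unsigned Size) {
  unsigned Scale = 0;
  while ((1u << Scale) < Size)
    ++Scale; // Log2 of the access size.
  if ((Offset & (Size - 1)) == 0 && Offset >= 0 &&
      Offset < (int64_t(0x1000) << Scale))
    return AddrImmKind::ScaledU12;
  if (Offset >= -256 && Offset < 256)
    return AddrImmKind::UnscaledS9;
  return AddrImmKind::NotAnImm;
}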
-InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, - unsigned Size) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - - if (!Root.isReg()) - return None; - - MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); - if (!RootDef) - return None; - - if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, - }}; - } - - if (isBaseWithConstantOffset(Root, MRI)) { - MachineOperand &LHS = RootDef->getOperand(1); - MachineOperand &RHS = RootDef->getOperand(2); - MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); - MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); - if (LHSDef && RHSDef) { - int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, - }}; - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, - }}; - } - } - } - - // Before falling back to our general case, check if the unscaled - // instructions can handle this. If so, that's preferable. - if (selectAddrModeUnscaled(Root, Size).hasValue()) - return None; - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, - }}; -} - -/// Given a shift instruction, return the correct shift type for that -/// instruction. -static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { - // TODO: Handle AArch64_AM::ROR - switch (MI.getOpcode()) { - default: - return AArch64_AM::InvalidShiftExtend; - case TargetOpcode::G_SHL: - return AArch64_AM::LSL; - case TargetOpcode::G_LSHR: - return AArch64_AM::LSR; - case TargetOpcode::G_ASHR: - return AArch64_AM::ASR; - } -} - -/// Select a "shifted register" operand. If the value is not shifted, set the -/// shift operand to a default value of "lsl 0". -/// -/// TODO: Allow shifted register to be rotated in logical instructions. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { - if (!Root.isReg()) - return None; - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - - // Check if the operand is defined by an instruction which corresponds to - // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. - // - // TODO: Handle AArch64_AM::ROR for logical instructions. - MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); - if (!ShiftInst) - return None; - AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); - if (ShType == AArch64_AM::InvalidShiftExtend) - return None; - if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) - return None; - - // Need an immediate on the RHS. - MachineOperand &ShiftRHS = ShiftInst->getOperand(2); - auto Immed = getImmedFromMO(ShiftRHS); - if (!Immed) - return None; - - // We have something that we can fold. Fold in the shift's LHS and RHS into - // the instruction. 
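// Small sketch, not taken from the selector itself, of the folding step right
// below: the immediate pulled into a shifted-register operand is reduced
// modulo the operand width, so only the low 5 (32-bit) or 6 (64-bit) bits of
// the shift amount survive. The function name is made up for illustration.
#include <cassert>
#include <cstdint>

static unsigned foldedShiftAmount(uint64_t Immed, unsigned NumBits) {
  assert((NumBits == 32 || NumBits == 64) && "expected a GPR-sized operand");
  return unsigned(Immed & (NumBits - 1)); // e.g. 36 on a 32-bit value becomes 4.
}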
- MachineOperand &ShiftLHS = ShiftInst->getOperand(1); - Register ShiftReg = ShiftLHS.getReg(); - - unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); - unsigned Val = *Immed & (NumBits - 1); - unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); - - return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; -} - -AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( - MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { - unsigned Opc = MI.getOpcode(); - - // Handle explicit extend instructions first. - if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { - unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - assert(Size != 64 && "Extend from 64 bits?"); - switch (Size) { - case 8: - return AArch64_AM::SXTB; - case 16: - return AArch64_AM::SXTH; - case 32: - return AArch64_AM::SXTW; - default: - return AArch64_AM::InvalidShiftExtend; - } - } - - if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { - unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - assert(Size != 64 && "Extend from 64 bits?"); - switch (Size) { - case 8: - return AArch64_AM::UXTB; - case 16: - return AArch64_AM::UXTH; - case 32: - return AArch64_AM::UXTW; - default: - return AArch64_AM::InvalidShiftExtend; - } - } - - // Don't have an explicit extend. Try to handle a G_AND with a constant mask - // on the RHS. - if (Opc != TargetOpcode::G_AND) - return AArch64_AM::InvalidShiftExtend; - - Optional MaybeAndMask = getImmedFromMO(MI.getOperand(2)); - if (!MaybeAndMask) - return AArch64_AM::InvalidShiftExtend; - uint64_t AndMask = *MaybeAndMask; - switch (AndMask) { - default: - return AArch64_AM::InvalidShiftExtend; - case 0xFF: - return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; - case 0xFFFF: - return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; - case 0xFFFFFFFF: - return AArch64_AM::UXTW; - } -} - -Register AArch64InstructionSelector::narrowExtendRegIfNeeded( - Register ExtReg, MachineIRBuilder &MIB) const { - MachineRegisterInfo &MRI = *MIB.getMRI(); - if (MRI.getType(ExtReg).getSizeInBits() == 32) - return ExtReg; - - // Insert a copy to move ExtReg to GPR32. - Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); - - // Select the copy into a subregister copy. - selectCopy(*Copy, TII, MRI, TRI, RBI); - return Copy.getReg(0); -} - -/// Select an "extended register" operand. This operand folds in an extend -/// followed by an optional left shift. -InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectArithExtendedRegister( - MachineOperand &Root) const { - if (!Root.isReg()) - return None; - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - - uint64_t ShiftVal = 0; - Register ExtReg; - AArch64_AM::ShiftExtendType Ext; - MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); - if (!RootDef) - return None; - - if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) - return None; - - // Check if we can fold a shift and an extend. - if (RootDef->getOpcode() == TargetOpcode::G_SHL) { - // Look for a constant on the RHS of the shift. 
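// Illustrative-only mapping of the G_AND-mask case handled above: a mask of
// 0xFF, 0xFFFF or 0xFFFFFFFF acts like a zero-extend of the matching width,
// except that the 8- and 16-bit forms are rejected when folding into a
// load/store addressing mode. The enum and helper are placeholders.
#include <cstdint>
#include <optional>

enum class ZExtKind { UXTB, UXTH, UXTW };

static std::optional<ZExtKind> extendForAndMask(uint64_t Mask, bool IsLoadStore) {
  if (Mask == 0xFFFFFFFFULL)
    return ZExtKind::UXTW;   // Usable in both contexts.
  if (IsLoadStore)
    return std::nullopt;     // Narrower masks are not valid extends here.
  if (Mask == 0xFF)
    return ZExtKind::UXTB;
  if (Mask == 0xFFFF)
    return ZExtKind::UXTH;
  return std::nullopt;
}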
- MachineOperand &RHS = RootDef->getOperand(2); - Optional MaybeShiftVal = getImmedFromMO(RHS); - if (!MaybeShiftVal) - return None; - ShiftVal = *MaybeShiftVal; - if (ShiftVal > 4) - return None; - // Look for a valid extend instruction on the LHS of the shift. - MachineOperand &LHS = RootDef->getOperand(1); - MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); - if (!ExtDef) - return None; - Ext = getExtendTypeForInst(*ExtDef, MRI); - if (Ext == AArch64_AM::InvalidShiftExtend) - return None; - ExtReg = ExtDef->getOperand(1).getReg(); - } else { - // Didn't get a shift. Try just folding an extend. - Ext = getExtendTypeForInst(*RootDef, MRI); - if (Ext == AArch64_AM::InvalidShiftExtend) - return None; - ExtReg = RootDef->getOperand(1).getReg(); - - // If we have a 32 bit instruction which zeroes out the high half of a - // register, we get an implicit zero extend for free. Check if we have one. - // FIXME: We actually emit the extend right now even though we don't have - // to. - if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { - MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); - if (ExtInst && isDef32(*ExtInst)) - return None; - } - } - - // We require a GPR32 here. Narrow the ExtReg if needed using a subregister - // copy. - MachineIRBuilder MIB(*RootDef); - ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); - - return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, - [=](MachineInstrBuilder &MIB) { - MIB.addImm(getArithExtendImm(Ext, ShiftVal)); - }}}; -} - -void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && - "Expected G_CONSTANT"); - Optional CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); - assert(CstVal && "Expected constant value"); - MIB.addImm(CstVal.getValue()); -} - -void AArch64InstructionSelector::renderLogicalImm32( - MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { - assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && - "Expected G_CONSTANT"); - uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); - uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); - MIB.addImm(Enc); -} - -void AArch64InstructionSelector::renderLogicalImm64( - MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { - assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && - "Expected G_CONSTANT"); - uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); - uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); - MIB.addImm(Enc); -} - -bool AArch64InstructionSelector::isLoadStoreOfNumBytes( - const MachineInstr &MI, unsigned NumBytes) const { - if (!MI.mayLoadOrStore()) - return false; - assert(MI.hasOneMemOperand() && - "Expected load/store to have only one mem op!"); - return (*MI.memoperands_begin())->getSize() == NumBytes; -} - -bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) - return false; - - // Only return true if we know the operation will zero-out the high half of - // the 64-bit register. Truncates can be subregister copies, which don't - // zero out the high bits. 
Copies and other copy-like instructions can be - // fed by truncates, or could be lowered as subregister copies. - switch (MI.getOpcode()) { - default: - return true; - case TargetOpcode::COPY: - case TargetOpcode::G_BITCAST: - case TargetOpcode::G_TRUNC: - case TargetOpcode::G_PHI: - return false; - } -} - -namespace llvm { -InstructionSelector * -createAArch64InstructionSelector(const AArch64TargetMachine &TM, - AArch64Subtarget &Subtarget, - AArch64RegisterBankInfo &RBI) { - return new AArch64InstructionSelector(TM, Subtarget, RBI); -} -} diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp deleted file mode 100644 index 95719a35c6daa..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ /dev/null @@ -1,771 +0,0 @@ -//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements the targeting of the Machinelegalizer class for -/// AArch64. -/// \todo This should be generated by TableGen. -//===----------------------------------------------------------------------===// - -#include "AArch64LegalizerInfo.h" -#include "AArch64Subtarget.h" -#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Type.h" - -#define DEBUG_TYPE "aarch64-legalinfo" - -using namespace llvm; -using namespace LegalizeActions; -using namespace LegalizeMutations; -using namespace LegalityPredicates; - -AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { - using namespace TargetOpcode; - const LLT p0 = LLT::pointer(0, 64); - const LLT s1 = LLT::scalar(1); - const LLT s8 = LLT::scalar(8); - const LLT s16 = LLT::scalar(16); - const LLT s32 = LLT::scalar(32); - const LLT s64 = LLT::scalar(64); - const LLT s128 = LLT::scalar(128); - const LLT s256 = LLT::scalar(256); - const LLT s512 = LLT::scalar(512); - const LLT v16s8 = LLT::vector(16, 8); - const LLT v8s8 = LLT::vector(8, 8); - const LLT v4s8 = LLT::vector(4, 8); - const LLT v8s16 = LLT::vector(8, 16); - const LLT v4s16 = LLT::vector(4, 16); - const LLT v2s16 = LLT::vector(2, 16); - const LLT v2s32 = LLT::vector(2, 32); - const LLT v4s32 = LLT::vector(4, 32); - const LLT v2s64 = LLT::vector(2, 64); - const LLT v2p0 = LLT::vector(2, p0); - - // FIXME: support subtargets which have neon/fp-armv8 disabled. 
- if (!ST.hasNEON() || !ST.hasFPARMv8()) { - computeTables(); - return; - } - - getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) - .clampScalar(0, s1, s64) - .widenScalarToNextPow2(0, 8) - .fewerElementsIf( - [=](const LegalityQuery &Query) { - return Query.Types[0].isVector() && - (Query.Types[0].getElementType() != s64 || - Query.Types[0].getNumElements() != 2); - }, - [=](const LegalityQuery &Query) { - LLT EltTy = Query.Types[0].getElementType(); - if (EltTy == s64) - return std::make_pair(0, LLT::vector(2, 64)); - return std::make_pair(0, EltTy); - }); - - getActionDefinitionsBuilder(G_PHI) - .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) - .clampScalar(0, s16, s64) - .widenScalarToNextPow2(0); - - getActionDefinitionsBuilder(G_BSWAP) - .legalFor({s32, s64, v4s32, v2s32, v2s64}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0); - - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) - .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64) - .moreElementsToNextPow2(0); - - getActionDefinitionsBuilder(G_SHL) - .legalFor({{s32, s32}, {s64, s64}, - {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) - .clampScalar(1, s32, s64) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64) - .moreElementsToNextPow2(0) - .minScalarSameAs(1, 0); - - getActionDefinitionsBuilder(G_PTR_ADD) - .legalFor({{p0, s64}}) - .clampScalar(1, s64, s64); - - getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0}); - - getActionDefinitionsBuilder({G_SDIV, G_UDIV}) - .legalFor({s32, s64}) - .libcallFor({s128}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .scalarize(0); - - getActionDefinitionsBuilder({G_LSHR, G_ASHR}) - .customIf([=](const LegalityQuery &Query) { - const auto &SrcTy = Query.Types[0]; - const auto &AmtTy = Query.Types[1]; - return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && - AmtTy.getSizeInBits() == 32; - }) - .legalFor({{s32, s32}, - {s32, s64}, - {s64, s64}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v2s64, v2s64}}) - .clampScalar(1, s32, s64) - .clampScalar(0, s32, s64) - .minScalarSameAs(1, 0); - - getActionDefinitionsBuilder({G_SREM, G_UREM}) - .lowerFor({s1, s8, s16, s32, s64}); - - getActionDefinitionsBuilder({G_SMULO, G_UMULO}) - .lowerFor({{s64, s1}}); - - getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); - - getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO}) - .legalFor({{s32, s1}, {s64, s1}}) - .minScalar(0, s32); - - getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) - .legalFor({s32, s64, v2s64, v4s32, v2s32}); - - getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); - - getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, - G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, - G_FNEARBYINT}) - // If we don't have full FP16 support, then scalarize the elements of - // vectors containing fp16 types. - .fewerElementsIf( - [=, &ST](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - return Ty.isVector() && Ty.getElementType() == s16 && - !ST.hasFullFP16(); - }, - [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) - // If we don't have full FP16 support, then widen s16 to s32 if we - // encounter it. 
- .widenScalarIf( - [=, &ST](const LegalityQuery &Query) { - return Query.Types[0] == s16 && !ST.hasFullFP16(); - }, - [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) - .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); - - getActionDefinitionsBuilder( - {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW}) - // We need a call for these, so we always need to scalarize. - .scalarize(0) - // Regardless of FP16 support, widen 16-bit elements to 32-bits. - .minScalar(0, s32) - .libcallFor({s32, s64, v2s32, v4s32, v2s64}); - - getActionDefinitionsBuilder(G_INSERT) - .unsupportedIf([=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits(); - }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - const LLT &Ty1 = Query.Types[1]; - if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0) - return false; - return isPowerOf2_32(Ty1.getSizeInBits()) && - (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8); - }) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .maxScalarIf(typeInSet(0, {s32}), 1, s16) - .maxScalarIf(typeInSet(0, {s64}), 1, s32) - .widenScalarToNextPow2(1); - - getActionDefinitionsBuilder(G_EXTRACT) - .unsupportedIf([=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits(); - }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - const LLT &Ty1 = Query.Types[1]; - if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) - return false; - if (Ty1 == p0) - return true; - return isPowerOf2_32(Ty0.getSizeInBits()) && - (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); - }) - .clampScalar(1, s32, s128) - .widenScalarToNextPow2(1) - .maxScalarIf(typeInSet(1, {s32}), 0, s16) - .maxScalarIf(typeInSet(1, {s64}), 0, s32) - .widenScalarToNextPow2(0); - - getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}, - {s32, p0, 32, 8}, - {s64, p0, 8, 2}, - {s64, p0, 16, 2}, - {s64, p0, 32, 4}, - {s64, p0, 64, 8}, - {p0, p0, 64, 8}, - {v2s32, p0, 64, 8}}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. 
- .unsupportedIfMemSizeNotPow2() - // Lower anything left over into G_*EXT and G_LOAD - .lower(); - - auto IsPtrVecPred = [=](const LegalityQuery &Query) { - const LLT &ValTy = Query.Types[0]; - if (!ValTy.isVector()) - return false; - const LLT EltTy = ValTy.getElementType(); - return EltTy.isPointer() && EltTy.getAddressSpace() == 0; - }; - - getActionDefinitionsBuilder(G_LOAD) - .legalForTypesWithMemDesc({{s8, p0, 8, 8}, - {s16, p0, 16, 8}, - {s32, p0, 32, 8}, - {s64, p0, 64, 8}, - {p0, p0, 64, 8}, - {s128, p0, 128, 8}, - {v8s8, p0, 64, 8}, - {v16s8, p0, 128, 8}, - {v4s16, p0, 64, 8}, - {v8s16, p0, 128, 8}, - {v2s32, p0, 64, 8}, - {v4s32, p0, 128, 8}, - {v2s64, p0, 128, 8}}) - // These extends are also legal - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}}) - .clampScalar(0, s8, s64) - .lowerIfMemSizeNotPow2() - // Lower any any-extending loads left into G_ANYEXT and G_LOAD - .lowerIf([=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; - }) - .widenScalarToNextPow2(0) - .clampMaxNumElements(0, s32, 2) - .clampMaxNumElements(0, s64, 1) - .customIf(IsPtrVecPred); - - getActionDefinitionsBuilder(G_STORE) - .legalForTypesWithMemDesc({{s8, p0, 8, 8}, - {s16, p0, 16, 8}, - {s32, p0, 8, 8}, - {s32, p0, 16, 8}, - {s32, p0, 32, 8}, - {s64, p0, 64, 8}, - {p0, p0, 64, 8}, - {s128, p0, 128, 8}, - {v16s8, p0, 128, 8}, - {v4s16, p0, 64, 8}, - {v8s16, p0, 128, 8}, - {v2s32, p0, 64, 8}, - {v4s32, p0, 128, 8}, - {v2s64, p0, 128, 8}}) - .clampScalar(0, s8, s64) - .lowerIfMemSizeNotPow2() - .lowerIf([=](const LegalityQuery &Query) { - return Query.Types[0].isScalar() && - Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; - }) - .clampMaxNumElements(0, s32, 2) - .clampMaxNumElements(0, s64, 1) - .customIf(IsPtrVecPred); - - // Constants - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({p0, s8, s16, s32, s64}) - .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0); - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({s32, s64}) - .clampScalar(0, s32, s64); - - getActionDefinitionsBuilder(G_ICMP) - .legalFor({{s32, s32}, - {s32, s64}, - {s32, p0}, - {v4s32, v4s32}, - {v2s32, v2s32}, - {v2s64, v2s64}, - {v2s64, v2p0}, - {v4s16, v4s16}, - {v8s16, v8s16}, - {v8s8, v8s8}, - {v16s8, v16s8}}) - .clampScalar(1, s32, s64) - .clampScalar(0, s32, s32) - .minScalarEltSameAsIf( - [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[0]; - const LLT &SrcTy = Query.Types[1]; - return Ty.isVector() && !SrcTy.getElementType().isPointer() && - Ty.getElementType() != SrcTy.getElementType(); - }, - 0, 1) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, - 1, s32) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, - s64) - .widenScalarOrEltToNextPow2(1); - - getActionDefinitionsBuilder(G_FCMP) - .legalFor({{s32, s32}, {s32, s64}}) - .clampScalar(0, s32, s32) - .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1); - - // Extensions - auto ExtLegalFunc = [=](const LegalityQuery &Query) { - unsigned DstSize = Query.Types[0].getSizeInBits(); - - if (DstSize == 128 && !Query.Types[0].isVector()) - return false; // Extending to a scalar s128 needs narrowing. - - // Make sure that we have something that will fit in a register, and - // make sure it's a power of 2. - if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) - return false; - - const LLT &SrcTy = Query.Types[1]; - - // Special case for s1. 
- if (SrcTy == s1) - return true; - - // Make sure we fit in a register otherwise. Don't bother checking that - // the source type is below 128 bits. We shouldn't be allowing anything - // through which is wider than the destination in the first place. - unsigned SrcSize = SrcTy.getSizeInBits(); - if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) - return false; - - return true; - }; - getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) - .legalIf(ExtLegalFunc) - .clampScalar(0, s64, s64); // Just for s128, others are handled above. - - getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); - - getActionDefinitionsBuilder(G_SEXT_INREG).lower(); - - // FP conversions - getActionDefinitionsBuilder(G_FPTRUNC).legalFor( - {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); - getActionDefinitionsBuilder(G_FPEXT).legalFor( - {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}); - - // Conversions - getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1); - - getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) - .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0); - - // Control-flow - getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32}); - getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); - - // Select - // FIXME: We can probably do a bit better than just scalarizing vector - // selects. - getActionDefinitionsBuilder(G_SELECT) - .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .scalarize(0); - - // Pointer-handling - getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); - getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); - - getActionDefinitionsBuilder(G_PTRTOINT) - .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) - .maxScalar(0, s64) - .widenScalarToNextPow2(0, /*Min*/ 8); - - getActionDefinitionsBuilder(G_INTTOPTR) - .unsupportedIf([&](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); - }) - .legalFor({{p0, s64}}); - - // Casts for 32 and 64-bit width type are just copies. - // Same for 128-bit width type, except they are on the FPR bank. - getActionDefinitionsBuilder(G_BITCAST) - // FIXME: This is wrong since G_BITCAST is not allowed to change the - // number of bits but it's what the previous code described and fixing - // it breaks tests. - .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, - v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, - v2p0}); - - getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); - - // va_list must be a pointer, but most sized types are pretty easy to handle - // as the destination. 
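// Condensed sketch, under the same assumptions as ExtLegalFunc above, of when
// a scalar G_ZEXT/G_SEXT/G_ANYEXT is accepted as-is: a power-of-two
// destination of 8..128 bits that is not a scalar s128, fed by either an s1
// or a power-of-two source of at least 8 bits. Purely illustrative; vector
// element types are not modelled beyond the s128 exception.
static bool isPow2(unsigned N) { return N && (N & (N - 1)) == 0; }

static bool extendIsLegal(unsigned DstBits, unsigned SrcBits, bool DstIsVector) {
  if (DstBits == 128 && !DstIsVector)
    return false;                       // Scalar s128 extends need narrowing.
  if (DstBits < 8 || DstBits > 128 || !isPow2(DstBits))
    return false;
  if (SrcBits == 1)
    return true;                        // s1 sources are special-cased.
  return SrcBits >= 8 && isPow2(SrcBits);
}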
- getActionDefinitionsBuilder(G_VAARG) - .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) - .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0, /*Min*/ 8); - - if (ST.hasLSE()) { - getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) - .lowerIf(all( - typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0), - atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); - - getActionDefinitionsBuilder( - {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, - G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, - G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG}) - .legalIf(all( - typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), - atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); - } - - getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); - - // Merge/Unmerge - for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { - unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; - unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; - - auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) { - const LLT &Ty = Query.Types[TypeIdx]; - if (Ty.isVector()) { - const LLT &EltTy = Ty.getElementType(); - if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) - return true; - if (!isPowerOf2_32(EltTy.getSizeInBits())) - return true; - } - return false; - }; - - // FIXME: This rule is horrible, but specifies the same as what we had - // before with the particularly strange definitions removed (e.g. - // s8 = G_MERGE_VALUES s32, s32). - // Part of the complexity comes from these ops being extremely flexible. For - // example, you can build/decompose vectors with it, concatenate vectors, - // etc. and in addition to this you can also bitcast with it at the same - // time. We've been considering breaking it up into multiple ops to make it - // more manageable throughout the backend. - getActionDefinitionsBuilder(Op) - // Break up vectors with weird elements into scalars - .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, - scalarize(0)) - .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, - scalarize(1)) - // Clamp the big scalar to s8-s512 and make it either a power of 2, 192, - // or 384. - .clampScalar(BigTyIdx, s8, s512) - .widenScalarIf( - [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 64 != 0; - }, - [=](const LegalityQuery &Query) { - // Pick the next power of 2, or a multiple of 64 over 128. - // Whichever is smaller. - const LLT &Ty = Query.Types[BigTyIdx]; - unsigned NewSizeInBits = 1 - << Log2_32_Ceil(Ty.getSizeInBits() + 1); - if (NewSizeInBits >= 256) { - unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); - if (RoundedTo < NewSizeInBits) - NewSizeInBits = RoundedTo; - } - return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); - }) - // Clamp the little scalar to s8-s256 and make it a power of 2. It's not - // worth considering the multiples of 64 since 2*192 and 2*384 are not - // valid. - .clampScalar(LitTyIdx, s8, s256) - .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8) - // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384, - // s512, , , , or . - // At this point it's simple enough to accept the legal types. 
- .legalIf([=](const LegalityQuery &Query) { - const LLT &BigTy = Query.Types[BigTyIdx]; - const LLT &LitTy = Query.Types[LitTyIdx]; - if (BigTy.isVector() && BigTy.getSizeInBits() < 32) - return false; - if (LitTy.isVector() && LitTy.getSizeInBits() < 32) - return false; - return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; - }) - // Any vectors left are the wrong size. Scalarize them. - .scalarize(0) - .scalarize(1); - } - - getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) - .unsupportedIf([=](const LegalityQuery &Query) { - const LLT &EltTy = Query.Types[1].getElementType(); - return Query.Types[0] != EltTy; - }) - .minScalar(2, s64) - .legalIf([=](const LegalityQuery &Query) { - const LLT &VecTy = Query.Types[1]; - return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || - VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32; - }); - - getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) - .legalIf([=](const LegalityQuery &Query) { - const LLT &VecTy = Query.Types[0]; - // TODO: Support s8 and s16 - return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64; - }); - - getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v4s16, s16}, - {v8s16, s16}, - {v2s32, s32}, - {v4s32, s32}, - {v2p0, p0}, - {v2s64, s64}}) - .clampNumElements(0, v4s32, v4s32) - .clampNumElements(0, v2s64, v2s64) - - // Deal with larger scalar types, which will be implicitly truncated. - .legalIf([=](const LegalityQuery &Query) { - return Query.Types[0].getScalarSizeInBits() < - Query.Types[1].getSizeInBits(); - }) - .minScalarSameAs(1, 0); - - getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct( - {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) - .scalarize(1); - - getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) - .legalIf([=](const LegalityQuery &Query) { - const LLT &DstTy = Query.Types[0]; - const LLT &SrcTy = Query.Types[1]; - // For now just support the TBL2 variant which needs the source vectors - // to be the same size as the dest. - if (DstTy != SrcTy) - return false; - for (auto &Ty : {v2s32, v4s32, v2s64}) { - if (DstTy == Ty) - return true; - } - return false; - }) - // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we - // just want those lowered into G_BUILD_VECTOR - .lowerIf([=](const LegalityQuery &Query) { - return !Query.Types[1].isVector(); - }) - .clampNumElements(0, v4s32, v4s32) - .clampNumElements(0, v2s64, v2s64); - - getActionDefinitionsBuilder(G_CONCAT_VECTORS) - .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); - - getActionDefinitionsBuilder(G_JUMP_TABLE) - .legalFor({{p0}, {s64}}); - - getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { - return Query.Types[0] == p0 && Query.Types[1] == s64; - }); - - getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); - - computeTables(); - verify(*ST.getInstrInfo()); -} - -bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { - switch (MI.getOpcode()) { - default: - // No idea what to do. 
- return false; - case TargetOpcode::G_VAARG: - return legalizeVaArg(MI, MRI, MIRBuilder); - case TargetOpcode::G_LOAD: - case TargetOpcode::G_STORE: - return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); - case TargetOpcode::G_SHL: - case TargetOpcode::G_ASHR: - case TargetOpcode::G_LSHR: - return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); - } - - llvm_unreachable("expected switch to return"); -} - -bool AArch64LegalizerInfo::legalizeIntrinsic( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, MRI, MI) == - LegalizerHelper::UnableToLegalize) - return false; - MI.eraseFromParent(); - return true; - default: - break; - } - return true; -} - -bool AArch64LegalizerInfo::legalizeShlAshrLshr( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { - assert(MI.getOpcode() == TargetOpcode::G_ASHR || - MI.getOpcode() == TargetOpcode::G_LSHR || - MI.getOpcode() == TargetOpcode::G_SHL); - // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the - // imported patterns can select it later. Either way, it will be legal. - Register AmtReg = MI.getOperand(2).getReg(); - auto *CstMI = MRI.getVRegDef(AmtReg); - assert(CstMI && "expected to find a vreg def"); - if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT) - return true; - // Check the shift amount is in range for an immediate form. - unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue(); - if (Amount > 31) - return true; // This will have to remain a register variant. - assert(MRI.getType(AmtReg).getSizeInBits() == 32); - MIRBuilder.setInstr(MI); - auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); - MI.getOperand(2).setReg(ExtCst.getReg(0)); - return true; -} - -bool AArch64LegalizerInfo::legalizeLoadStore( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { - assert(MI.getOpcode() == TargetOpcode::G_STORE || - MI.getOpcode() == TargetOpcode::G_LOAD); - // Here we just try to handle vector loads/stores where our value type might - // have pointer elements, which the SelectionDAG importer can't handle. To - // allow the existing patterns for s64 to fire for p0, we just try to bitcast - // the value to use s64 types. - - // Custom legalization requires the instruction, if not deleted, must be fully - // legalized. In order to allow further legalization of the inst, we create - // a new instruction and erase the existing one. 
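// Minimal sketch of the decision made in legalizeShlAshrLshr above, with the
// MachineInstr plumbing stripped out: only a constant shift amount in the
// range [0, 31] gets widened to 64 bits so the imported immediate patterns
// can match; anything else stays in its register form. Helper name assumed.
#include <cstdint>
#include <optional>

static bool shouldWidenShiftAmount(std::optional<uint64_t> ConstAmount) {
  if (!ConstAmount)
    return false;            // Not a G_CONSTANT: keep the register variant.
  return *ConstAmount <= 31; // Larger amounts also stay as registers.
}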
- - Register ValReg = MI.getOperand(0).getReg(); - const LLT ValTy = MRI.getType(ValReg); - - if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || - ValTy.getElementType().getAddressSpace() != 0) { - LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); - return false; - } - - MIRBuilder.setInstr(MI); - unsigned PtrSize = ValTy.getElementType().getSizeInBits(); - const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize); - auto &MMO = **MI.memoperands_begin(); - if (MI.getOpcode() == TargetOpcode::G_STORE) { - auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); - MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); - } else { - Register NewReg = MRI.createGenericVirtualRegister(NewTy); - auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); - MIRBuilder.buildBitcast({ValReg}, {NewLoad}); - } - MI.eraseFromParent(); - return true; -} - -bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MIRBuilder.setInstr(MI); - MachineFunction &MF = MIRBuilder.getMF(); - unsigned Align = MI.getOperand(2).getImm(); - Register Dst = MI.getOperand(0).getReg(); - Register ListPtr = MI.getOperand(1).getReg(); - - LLT PtrTy = MRI.getType(ListPtr); - LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); - - const unsigned PtrSize = PtrTy.getSizeInBits() / 8; - Register List = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildLoad( - List, ListPtr, - *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, - PtrSize, /* Align = */ PtrSize)); - - Register DstPtr; - if (Align > PtrSize) { - // Realign the list to the actual required alignment. - auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); - - auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); - - DstPtr = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); - } else - DstPtr = List; - - uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8; - MIRBuilder.buildLoad( - Dst, DstPtr, - *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, - ValSize, std::max(Align, PtrSize))); - - auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize)); - - auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); - - MIRBuilder.buildStore( - NewList, ListPtr, - *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, - PtrSize, /* Align = */ PtrSize)); - - MI.eraseFromParent(); - return true; -} diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h deleted file mode 100644 index 15161bab466c4..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ /dev/null @@ -1,48 +0,0 @@ -//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares the targeting of the Machinelegalizer class for -/// AArch64. -/// \todo This should be generated by TableGen. 
-//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H - -#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" - -namespace llvm { - -class LLVMContext; -class AArch64Subtarget; - -/// This class provides the information for the target register banks. -class AArch64LegalizerInfo : public LegalizerInfo { -public: - AArch64LegalizerInfo(const AArch64Subtarget &ST); - - bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const override; - - bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; - -private: - bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; - bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const; - bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const; -}; -} // End llvm namespace. -#endif diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 3156bb4469638..d975b8bd04fe6 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -66,6 +67,10 @@ static cl::opt LdStLimit("aarch64-load-store-scan-limit", static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); +// Enable register renaming to find additional store pairing opportunities. +static cl::opt EnableRenaming("aarch64-load-store-renaming", + cl::init(true), cl::Hidden); + #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { @@ -673,14 +678,14 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && "Expected promotable zero stores."); - MachineBasicBlock::iterator NextI = I; - ++NextI; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. 
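// Conceptual stand-in (not the LLVM API) for the next_nodbg/prev_nodbg calls
// introduced in this hunk: the scan advances at least one step and then skips
// anything the caller classifies as debug-only, so the presence of debug
// instructions cannot change which loads and stores get paired.
template <typename Iter, typename Pred>
static Iter nextNonDebug(Iter It, Iter End, Pred IsDebug) {
  ++It;                               // Always advance at least once.
  while (It != End && IsDebug(*It))   // Then step over debug-only entries.
    ++It;
  return It;
}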
if (NextI == MergeMI) - ++NextI; + NextI = next_nodbg(NextI, E); unsigned Opc = I->getOpcode(); bool IsScaled = !TII->isUnscaledLdSt(Opc); @@ -743,18 +748,17 @@ static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, const TargetRegisterInfo *TRI, unsigned Limit, std::function &Fn) { auto MBB = MI.getParent(); - for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(), - E = MBB->rend(); - I != E; I++) { + for (MachineInstr &I : + instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) { if (!Limit) return false; --Limit; - bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) { + bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) { return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && TRI->regsOverlap(MOP.getReg(), DefReg); }); - if (!Fn(*I, isDef)) + if (!Fn(I, isDef)) return false; if (isDef) break; @@ -778,14 +782,14 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, const LdStPairFlags &Flags) { - MachineBasicBlock::iterator NextI = I; - ++NextI; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. if (NextI == Paired) - ++NextI; + NextI = next_nodbg(NextI, E); int SExtIdx = Flags.getSExtIdx(); unsigned Opc = @@ -1004,8 +1008,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator StoreI) { - MachineBasicBlock::iterator NextI = LoadI; - ++NextI; + MachineBasicBlock::iterator NextI = + next_nodbg(LoadI, LoadI->getParent()->end()); int LoadSize = TII->getMemScale(*LoadI); int StoreSize = TII->getMemScale(*StoreI); @@ -1140,24 +1144,11 @@ static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, - AliasAnalysis *AA) { - // One of the instructions must modify memory. - if (!MIa.mayStore() && !MIb.mayStore()) - return false; - - // Both instructions must be memory operations. - if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) - return false; - - return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); -} - static bool mayAlias(MachineInstr &MIa, SmallVectorImpl &MemInsns, AliasAnalysis *AA) { for (MachineInstr *MIb : MemInsns) - if (mayAlias(MIa, *MIb, AA)) + if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) return true; return false; @@ -1183,7 +1174,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( unsigned Count = 0; do { - --MBBI; + MBBI = prev_nodbg(MBBI, B); MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there @@ -1215,7 +1206,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. 
- if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) + if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false)) return false; } while (MBBI != B && Count < Limit); return false; @@ -1296,7 +1287,23 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n"); return false; } - auto canRenameMOP = [](const MachineOperand &MOP) { + auto canRenameMOP = [TRI](const MachineOperand &MOP) { + if (MOP.isReg()) { + auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); + // Renaming registers with multiple disjunct sub-registers (e.g. the + // result of a LD3) means that all sub-registers are renamed, potentially + // impacting other instructions we did not check. Bail out. + // Note that this relies on the structure of the AArch64 register file. In + // particular, a subregister cannot be written without overwriting the + // whole register. + if (RegClass->HasDisjunctSubRegs) { + LLVM_DEBUG( + dbgs() + << " Cannot rename operands with multiple disjunct subregisters (" + << MOP << ")\n"); + return false; + } + } return MOP.isImplicit() || (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); }; @@ -1325,6 +1332,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, // For defs, check if we can rename the first def of RegToRename. if (FoundDef) { + // For some pseudo instructions, we might not generate code in the end + // (e.g. KILL) and we would end up without a correct def for the rename + // register. + // TODO: This might be overly conservative and we could handle those cases + // in multiple ways: + // 1. Insert an extra copy, to materialize the def. + // 2. Skip pseudo-defs until we find an non-pseudo def. + if (MI.isPseudo()) { + LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI + << "\n"); + return false; + } + for (auto &MOP : MI.operands()) { if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() || !TRI->regsOverlap(MOP.getReg(), RegToRename)) @@ -1422,7 +1442,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MBBI = I; MachineBasicBlock::iterator MBBIWithRenameReg; MachineInstr &FirstMI = *I; - ++MBBI; + MBBI = next_nodbg(MBBI, E); bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); @@ -1433,6 +1453,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); Optional MaybeCanRename = None; + if (!EnableRenaming) + MaybeCanRename = {false}; + SmallPtrSet RequiredClasses; LiveRegUnits UsedInBetween; UsedInBetween.init(*TRI); @@ -1447,7 +1470,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Remember any instructions that read/write memory between FirstMI and MI. 
SmallVector MemInsns; - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + for (unsigned Count = 0; MBBI != E && Count < Limit; + MBBI = next_nodbg(MBBI, E)) { MachineInstr &MI = *MBBI; UsedInBetween.accumulate(MI); @@ -1616,12 +1640,13 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); // Return the instruction following the merged instruction, which is // the instruction following our unmerged load. Unless that's the add/sub // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; + if (NextI == Update) + NextI = next_nodbg(NextI, E); int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && @@ -1759,8 +1784,24 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // insn (inclusive) and the second insn. ModifiedRegUnits.clear(); UsedRegUnits.clear(); - ++MBBI; - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MBBI = next_nodbg(MBBI, E); + + // We can't post-increment the stack pointer if any instruction between + // the memory access (I) and the increment (MBBI) can access the memory + // region defined by [SP, MBBI]. + const bool BaseRegSP = BaseReg == AArch64::SP; + if (BaseRegSP) { + // FIXME: For now, we always block the optimization over SP in windows + // targets as it requires to adjust the unwind/debug info, messing up + // the unwind info can actually cause a miscompile. + const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo(); + if (MAI->usesWindowsCFI() && + I->getMF()->getFunction().needsUnwindTableEntry()) + return E; + } + + for (unsigned Count = 0; MBBI != E && Count < Limit; + MBBI = next_nodbg(MBBI, E)) { MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there @@ -1777,8 +1818,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Otherwise, if the base register is used or modified, we have no match, so // return early. + // If we are optimizing SP, do not allow instructions that may load or store + // in between the load and the optimized value update. if (!ModifiedRegUnits.available(BaseReg) || - !UsedRegUnits.available(BaseReg)) + !UsedRegUnits.available(BaseReg) || + (BaseRegSP && MBBI->mayLoadOrStore())) return E; } return E; @@ -1815,7 +1859,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( UsedRegUnits.clear(); unsigned Count = 0; do { - --MBBI; + MBBI = prev_nodbg(MBBI, B); MachineInstr &MI = *MBBI; // Don't count transient instructions towards the search limit since there diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp new file mode 100644 index 0000000000000..a37e380725544 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -0,0 +1,32 @@ +//=- AArch64MachineFunctionInfo.cpp - AArch64 Machine Function Info ---------=// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements AArch64-specific per-machine-function +/// information. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64MachineFunctionInfo.h" + +using namespace llvm; + +yaml::AArch64FunctionInfo::AArch64FunctionInfo( + const llvm::AArch64FunctionInfo &MFI) + : HasRedZone(MFI.hasRedZone()) {} + +void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits::mapping(YamlIO, *this); +} + +void AArch64FunctionInfo::initializeBaseYamlFields( + const yaml::AArch64FunctionInfo &YamlMFI) { + if (YamlMFI.HasRedZone.hasValue()) + HasRedZone = YamlMFI.HasRedZone; +} diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 6ddb3fdb00463..84aa53f2bece1 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/Function.h" @@ -26,6 +27,10 @@ namespace llvm { +namespace yaml { +struct AArch64FunctionInfo; +} // end namespace yaml + class MachineInstr; /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and @@ -126,6 +131,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // stack slot. unsigned TaggedBasePointerOffset = 0; + /// OutliningStyle denotes, if a function was outined, how it was outlined, + /// e.g. Tail Call, Thunk, or Function if none apply. 
+ Optional OutliningStyle; + public: AArch64FunctionInfo() = default; @@ -137,6 +146,7 @@ public: if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) HasRedZone = false; } + void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } @@ -173,6 +183,9 @@ public: void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; } uint64_t getLocalStackSize() const { return LocalStackSize; } + void setOutliningStyle(std::string Style) { OutliningStyle = Style; } + Optional getOutliningStyle() const { return OutliningStyle; } + void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; HasCalleeSavedStackSize = true; @@ -333,6 +346,25 @@ private: DenseMap> JumpTableEntryInfo; }; +namespace yaml { +struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { + Optional HasRedZone; + + AArch64FunctionInfo() = default; + AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~AArch64FunctionInfo() = default; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { + YamlIO.mapOptional("hasRedZone", MFI.HasRedZone); + } +}; + +} // end namespace yaml + } // end namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp deleted file mode 100644 index 230fd514d0222..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass does combining of machine instructions at the generic MI level, -// before the legalizer. 
-// -//===----------------------------------------------------------------------===// - -#include "AArch64TargetMachine.h" -#include "llvm/CodeGen/GlobalISel/Combiner.h" -#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" -#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" -#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "aarch64-prelegalizer-combiner" - -using namespace llvm; -using namespace MIPatternMatch; - -#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AArch64GenGICombiner.inc" -#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS - -namespace { -#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AArch64GenGICombiner.inc" -#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H - -class AArch64PreLegalizerCombinerInfo : public CombinerInfo { - GISelKnownBits *KB; - MachineDominatorTree *MDT; - -public: - AArch64GenPreLegalizerCombinerHelper Generated; - - AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - GISelKnownBits *KB, MachineDominatorTree *MDT) - : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), - KB(KB), MDT(MDT) { - if (!Generated.parseCommandLineOption()) - report_fatal_error("Invalid rule identifier"); - } - - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; -}; - -bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B, KB, MDT); - - switch (MI.getOpcode()) { - case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: { - // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other - // heuristics decide. - unsigned MaxLen = EnableOpt ? 0 : 32; - // Try to inline memcpy type calls if optimizations are enabled. - return (!EnableMinSize) ? 
Helper.tryCombineMemCpyFamily(MI, MaxLen) - : false; - } - default: - break; - } - } - - if (Generated.tryCombineAll(Observer, MI, B, Helper)) - return true; - - switch (MI.getOpcode()) { - case TargetOpcode::G_CONCAT_VECTORS: - return Helper.tryCombineConcatVectors(MI); - case TargetOpcode::G_SHUFFLE_VECTOR: - return Helper.tryCombineShuffleVector(MI); - } - - return false; -} - -#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AArch64GenGICombiner.inc" -#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP - -// Pass boilerplate -// ================ - -class AArch64PreLegalizerCombiner : public MachineFunctionPass { -public: - static char ID; - - AArch64PreLegalizerCombiner(bool IsOptNone = false); - - StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; -private: - bool IsOptNone; -}; -} // end anonymous namespace - -void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.setPreservesCFG(); - getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); - if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); - } - MachineFunctionPass::getAnalysisUsage(AU); -} - -AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) - : MachineFunctionPass(ID), IsOptNone(IsOptNone) { - initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); -} - -bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { - if (MF.getProperties().hasProperty( - MachineFunctionProperties::Property::FailedISel)) - return false; - auto *TPC = &getAnalysis(); - const Function &F = MF.getFunction(); - bool EnableOpt = - MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); - GISelKnownBits *KB = &getAnalysis().get(MF); - MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); - AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), - F.hasMinSize(), KB, MDT); - Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); -} - -char AArch64PreLegalizerCombiner::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, - "Combine AArch64 machine instrs before legalization", - false, false) -INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) -INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, - "Combine AArch64 machine instrs before legalization", false, - false) - - -namespace llvm { -FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { - return new AArch64PreLegalizerCombiner(IsOptNone); -} -} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index 9135f1b401223..9044c94bc4fe5 100644 --- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -250,6 +250,20 @@ static bool isConstantUsingVectorTy(const Type *CstTy) { return false; } +// Returns true if \p C contains only ConstantData leafs and no global values, +// block addresses or constant expressions. Traverses ConstantAggregates. 
+static bool containsOnlyConstantData(const Constant *C) { + if (isa(C)) + return true; + + if (isa(C) || isa(C) || isa(C)) + return false; + + return all_of(C->operands(), [](const Use &U) { + return containsOnlyConstantData(cast(&U)); + }); +} + /// Check if the given use (Instruction + OpIdx) of Cst should be converted into /// a load of a global variable initialized with Cst. /// A use should be converted if it is legal to do so. @@ -304,7 +318,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, // Do not mess with inline asm. const CallInst *CI = dyn_cast(Instr); - return !(CI && isa(CI->getCalledValue())); + return !(CI && CI->isInlineAsm()); } /// Check if the given Cst should be converted into @@ -550,9 +564,10 @@ bool AArch64PromoteConstant::runOnFunction(Function &F, for (Use &U : I.operands()) { Constant *Cst = dyn_cast(U); // There is no point in promoting global values as they are already - // global. Do not promote constant expressions either, as they may - // require some code expansion. - if (!Cst || isa(Cst) || isa(Cst)) + // global. Do not promote constants containing constant expression, global + // values or blockaddresses either, as they may require some code + // expansion. + if (!Cst || isa(Cst) || !containsOnlyConstantData(Cst)) continue; // Check if this constant is worth promoting. diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp deleted file mode 100644 index 40efac261fd99..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ /dev/null @@ -1,852 +0,0 @@ -//===- AArch64RegisterBankInfo.cpp ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements the targeting of the RegisterBankInfo class for -/// AArch64. -/// \todo This should be generated by TableGen. -//===----------------------------------------------------------------------===// - -#include "AArch64RegisterBankInfo.h" -#include "AArch64InstrInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" -#include "llvm/CodeGen/LowLevelType.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include - -#define GET_TARGET_REGBANK_IMPL -#include "AArch64GenRegisterBank.inc" - -// This file will be TableGen'ed at some point. -#include "AArch64GenRegisterBankInfo.def" - -using namespace llvm; - -AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) - : AArch64GenRegisterBankInfo() { - static bool AlreadyInit = false; - // We have only one set of register banks, whatever the subtarget - // is. Therefore, the initialization of the RegBanks table should be - // done only once. Indeed the table of all register banks - // (AArch64::RegBanks) is unique in the compiler. At some point, it - // will get tablegen'ed and the whole constructor becomes empty. 
- if (AlreadyInit) - return; - AlreadyInit = true; - - const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); - (void)RBGPR; - assert(&AArch64::GPRRegBank == &RBGPR && - "The order in RegBanks is messed up"); - - const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); - (void)RBFPR; - assert(&AArch64::FPRRegBank == &RBFPR && - "The order in RegBanks is messed up"); - - const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID); - (void)RBCCR; - assert(&AArch64::CCRegBank == &RBCCR && "The order in RegBanks is messed up"); - - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && - "Subclass not added?"); - assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - - // The FPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && - "Subclass not added?"); - assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && - "Subclass not added?"); - assert(RBFPR.getSize() == 512 && - "FPRs should hold up to 512-bit via QQQQ sequence"); - - assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && - "Class not added?"); - assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); - - // Check that the TableGen'ed like file is in sync we our expectations. - // First, the Idx. - assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, - {PMI_GPR32, PMI_GPR64}) && - "PartialMappingIdx's are incorrectly ordered"); - assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, - {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, - PMI_FPR256, PMI_FPR512}) && - "PartialMappingIdx's are incorrectly ordered"); -// Now, the content. -// Check partial mapping. -#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ - do { \ - assert( \ - checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \ - #Idx " is incorrectly initialized"); \ - } while (false) - - CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); - CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); - CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); - CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); - CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); - CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); - CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR); - CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR); - -// Check value mapping. -#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ - do { \ - assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \ - PartialMappingIdx::PMI_First##RBName, Size, \ - Offset) && \ - #RBName #Size " " #Offset " is incorrectly initialized"); \ - } while (false) - -#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) - - CHECK_VALUEMAP(GPR, 32); - CHECK_VALUEMAP(GPR, 64); - CHECK_VALUEMAP(FPR, 16); - CHECK_VALUEMAP(FPR, 32); - CHECK_VALUEMAP(FPR, 64); - CHECK_VALUEMAP(FPR, 128); - CHECK_VALUEMAP(FPR, 256); - CHECK_VALUEMAP(FPR, 512); - -// Check the value mapping for 3-operands instructions where all the operands -// map to the same value mapping. 
-#define CHECK_VALUEMAP_3OPS(RBName, Size) \ - do { \ - CHECK_VALUEMAP_IMPL(RBName, Size, 0); \ - CHECK_VALUEMAP_IMPL(RBName, Size, 1); \ - CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ - } while (false) - - CHECK_VALUEMAP_3OPS(GPR, 32); - CHECK_VALUEMAP_3OPS(GPR, 64); - CHECK_VALUEMAP_3OPS(FPR, 32); - CHECK_VALUEMAP_3OPS(FPR, 64); - CHECK_VALUEMAP_3OPS(FPR, 128); - CHECK_VALUEMAP_3OPS(FPR, 256); - CHECK_VALUEMAP_3OPS(FPR, 512); - -#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ - do { \ - unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \ - unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ - (void)PartialMapDstIdx; \ - (void)PartialMapSrcIdx; \ - const ValueMapping *Map = getCopyMapping( \ - AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ - (void)Map; \ - assert(Map[0].BreakDown == \ - &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ - Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ - " Dst is incorrectly initialized"); \ - assert(Map[1].BreakDown == \ - &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ - Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ - " Src is incorrectly initialized"); \ - \ - } while (false) - - CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32); - CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); - CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); - -#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ - do { \ - unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ - unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ - (void)PartialMapDstIdx; \ - (void)PartialMapSrcIdx; \ - const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ - (void)Map; \ - assert(Map[0].BreakDown == \ - &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ - Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ - " Dst is incorrectly initialized"); \ - assert(Map[1].BreakDown == \ - &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ - Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ - " Src is incorrectly initialized"); \ - \ - } while (false) - - CHECK_VALUEMAP_FPEXT(32, 16); - CHECK_VALUEMAP_FPEXT(64, 16); - CHECK_VALUEMAP_FPEXT(64, 32); - CHECK_VALUEMAP_FPEXT(128, 64); - - assert(verify(TRI) && "Invalid register bank information"); -} - -unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, - const RegisterBank &B, - unsigned Size) const { - // What do we do with different size? - // copy are same size. - // Will introduce other hooks for different size: - // * extract cost. - // * build_sequence cost. - - // Copy from (resp. to) GPR to (resp. from) FPR involves FMOV. - // FIXME: This should be deduced from the scheduling model. - if (&A == &AArch64::GPRRegBank && &B == &AArch64::FPRRegBank) - // FMOVXDr or FMOVWSr. - return 5; - if (&A == &AArch64::FPRRegBank && &B == &AArch64::GPRRegBank) - // FMOVDXr or FMOVSWr. 
- return 4; - - return RegisterBankInfo::copyCost(A, B, Size); -} - -const RegisterBank & -AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, - LLT) const { - switch (RC.getID()) { - case AArch64::FPR8RegClassID: - case AArch64::FPR16RegClassID: - case AArch64::FPR32RegClassID: - case AArch64::FPR64RegClassID: - case AArch64::FPR128RegClassID: - case AArch64::FPR128_loRegClassID: - case AArch64::DDRegClassID: - case AArch64::DDDRegClassID: - case AArch64::DDDDRegClassID: - case AArch64::QQRegClassID: - case AArch64::QQQRegClassID: - case AArch64::QQQQRegClassID: - return getRegBank(AArch64::FPRRegBankID); - case AArch64::GPR32commonRegClassID: - case AArch64::GPR32RegClassID: - case AArch64::GPR32spRegClassID: - case AArch64::GPR32sponlyRegClassID: - case AArch64::GPR32argRegClassID: - case AArch64::GPR32allRegClassID: - case AArch64::GPR64commonRegClassID: - case AArch64::GPR64RegClassID: - case AArch64::GPR64spRegClassID: - case AArch64::GPR64sponlyRegClassID: - case AArch64::GPR64argRegClassID: - case AArch64::GPR64allRegClassID: - case AArch64::GPR64noipRegClassID: - case AArch64::GPR64common_and_GPR64noipRegClassID: - case AArch64::GPR64noip_and_tcGPR64RegClassID: - case AArch64::tcGPR64RegClassID: - case AArch64::WSeqPairsClassRegClassID: - case AArch64::XSeqPairsClassRegClassID: - return getRegBank(AArch64::GPRRegBankID); - case AArch64::CCRRegClassID: - return getRegBank(AArch64::CCRegBankID); - default: - llvm_unreachable("Register class not supported"); - } -} - -RegisterBankInfo::InstructionMappings -AArch64RegisterBankInfo::getInstrAlternativeMappings( - const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - switch (MI.getOpcode()) { - case TargetOpcode::G_OR: { - // 32 and 64-bit or can be mapped on either FPR or - // GPR for the same cost. - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); - if (Size != 32 && Size != 64) - break; - - // If the instruction has any implicit-defs or uses, - // do not mess with it. - if (MI.getNumOperands() != 3) - break; - InstructionMappings AltMappings; - const InstructionMapping &GPRMapping = getInstructionMapping( - /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size), - /*NumOperands*/ 3); - const InstructionMapping &FPRMapping = getInstructionMapping( - /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size), - /*NumOperands*/ 3); - - AltMappings.push_back(&GPRMapping); - AltMappings.push_back(&FPRMapping); - return AltMappings; - } - case TargetOpcode::G_BITCAST: { - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); - if (Size != 32 && Size != 64) - break; - - // If the instruction has any implicit-defs or uses, - // do not mess with it. 
- if (MI.getNumOperands() != 2) - break; - - InstructionMappings AltMappings; - const InstructionMapping &GPRMapping = getInstructionMapping( - /*ID*/ 1, /*Cost*/ 1, - getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size), - /*NumOperands*/ 2); - const InstructionMapping &FPRMapping = getInstructionMapping( - /*ID*/ 2, /*Cost*/ 1, - getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size), - /*NumOperands*/ 2); - const InstructionMapping &GPRToFPRMapping = getInstructionMapping( - /*ID*/ 3, - /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size), - /*NumOperands*/ 2); - const InstructionMapping &FPRToGPRMapping = getInstructionMapping( - /*ID*/ 3, - /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size), - /*NumOperands*/ 2); - - AltMappings.push_back(&GPRMapping); - AltMappings.push_back(&FPRMapping); - AltMappings.push_back(&GPRToFPRMapping); - AltMappings.push_back(&FPRToGPRMapping); - return AltMappings; - } - case TargetOpcode::G_LOAD: { - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); - if (Size != 64) - break; - - // If the instruction has any implicit-defs or uses, - // do not mess with it. - if (MI.getNumOperands() != 2) - break; - - InstructionMappings AltMappings; - const InstructionMapping &GPRMapping = getInstructionMapping( - /*ID*/ 1, /*Cost*/ 1, - getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), - // Addresses are GPR 64-bit. - getValueMapping(PMI_FirstGPR, 64)}), - /*NumOperands*/ 2); - const InstructionMapping &FPRMapping = getInstructionMapping( - /*ID*/ 2, /*Cost*/ 1, - getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), - // Addresses are GPR 64-bit. - getValueMapping(PMI_FirstGPR, 64)}), - /*NumOperands*/ 2); - - AltMappings.push_back(&GPRMapping); - AltMappings.push_back(&FPRMapping); - return AltMappings; - } - default: - break; - } - return RegisterBankInfo::getInstrAlternativeMappings(MI); -} - -void AArch64RegisterBankInfo::applyMappingImpl( - const OperandsMapper &OpdMapper) const { - switch (OpdMapper.getMI().getOpcode()) { - case TargetOpcode::G_OR: - case TargetOpcode::G_BITCAST: - case TargetOpcode::G_LOAD: - // Those ID must match getInstrAlternativeMappings. - assert((OpdMapper.getInstrMapping().getID() >= 1 && - OpdMapper.getInstrMapping().getID() <= 4) && - "Don't know how to handle that ID"); - return applyDefaultMapping(OpdMapper); - default: - llvm_unreachable("Don't know how to handle that operation"); - } -} - -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. 
-static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - return true; - } - return false; -} - -const RegisterBankInfo::InstructionMapping & -AArch64RegisterBankInfo::getSameKindOfOperandsMapping( - const MachineInstr &MI) const { - const unsigned Opc = MI.getOpcode(); - const MachineFunction &MF = *MI.getParent()->getParent(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - unsigned NumOperands = MI.getNumOperands(); - assert(NumOperands <= 3 && - "This code is for instructions with 3 or less operands"); - - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - unsigned Size = Ty.getSizeInBits(); - bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); - - PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; - -#ifndef NDEBUG - // Make sure all the operands are using similar size and type. - // Should probably be checked by the machine verifier. - // This code won't catch cases where the number of lanes is - // different between the operands. - // If we want to go to that level of details, it is probably - // best to check that the types are the same, period. - // Currently, we just check that the register banks are the same - // for each types. - for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { - LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); - assert( - AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset( - RBIdx, OpTy.getSizeInBits()) == - AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) && - "Operand has incompatible size"); - bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc); - (void)OpIsFPR; - assert(IsFPR == OpIsFPR && "Operand has incompatible type"); - } -#endif // End NDEBUG. - - return getInstructionMapping(DefaultMappingID, 1, - getValueMapping(RBIdx, Size), NumOperands); -} - -bool AArch64RegisterBankInfo::hasFPConstraints( - const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const { - unsigned Op = MI.getOpcode(); - - // Do we have an explicit floating point instruction? - if (isPreISelGenericFloatingPointOpcode(Op)) - return true; - - // No. Check if we have a copy-like instruction. If we do, then we could - // still be fed by floating point instructions. - if (Op != TargetOpcode::COPY && !MI.isPHI()) - return false; - - // MI is copy-like. Return true if it outputs an FPR. 
- return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == - &AArch64::FPRRegBank; -} - -bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const { - switch (MI.getOpcode()) { - case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: - case TargetOpcode::G_FCMP: - return true; - default: - break; - } - return hasFPConstraints(MI, MRI, TRI); -} - -bool AArch64RegisterBankInfo::onlyDefinesFP( - const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const { - switch (MI.getOpcode()) { - case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: - case TargetOpcode::G_EXTRACT_VECTOR_ELT: - case TargetOpcode::G_INSERT_VECTOR_ELT: - return true; - default: - break; - } - return hasFPConstraints(MI, MRI, TRI); -} - -const RegisterBankInfo::InstructionMapping & -AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const unsigned Opc = MI.getOpcode(); - - // Try the default logic for non-generic instructions that are either copies - // or already have some operands assigned to banks. - if ((Opc != TargetOpcode::COPY && !isPreISelGenericOpcode(Opc)) || - Opc == TargetOpcode::G_PHI) { - const RegisterBankInfo::InstructionMapping &Mapping = - getInstrMappingImpl(MI); - if (Mapping.isValid()) - return Mapping; - } - - const MachineFunction &MF = *MI.getParent()->getParent(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - - switch (Opc) { - // G_{F|S|U}REM are not listed because they are not legal. - // Arithmetic ops. - case TargetOpcode::G_ADD: - case TargetOpcode::G_SUB: - case TargetOpcode::G_PTR_ADD: - case TargetOpcode::G_MUL: - case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: - // Bitwise ops. - case TargetOpcode::G_AND: - case TargetOpcode::G_OR: - case TargetOpcode::G_XOR: - // Floating point ops. - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - return getSameKindOfOperandsMapping(MI); - case TargetOpcode::G_FPEXT: { - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - return getInstructionMapping( - DefaultMappingID, /*Cost*/ 1, - getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), - /*NumOperands*/ 2); - } - // Shifts. - case TargetOpcode::G_SHL: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: { - LLT ShiftAmtTy = MRI.getType(MI.getOperand(2).getReg()); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - if (ShiftAmtTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() == 32) - return getInstructionMapping(DefaultMappingID, 1, - &ValMappings[Shift64Imm], 3); - return getSameKindOfOperandsMapping(MI); - } - case TargetOpcode::COPY: { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - // Check if one of the register is not a generic register. - if ((Register::isPhysicalRegister(DstReg) || - !MRI.getType(DstReg).isValid()) || - (Register::isPhysicalRegister(SrcReg) || - !MRI.getType(SrcReg).isValid())) { - const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); - const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); - if (!DstRB) - DstRB = SrcRB; - else if (!SrcRB) - SrcRB = DstRB; - // If both RB are null that means both registers are generic. - // We shouldn't be here. 
- assert(DstRB && SrcRB && "Both RegBank were nullptr"); - unsigned Size = getSizeInBits(DstReg, MRI, TRI); - return getInstructionMapping( - DefaultMappingID, copyCost(*DstRB, *SrcRB, Size), - getCopyMapping(DstRB->getID(), SrcRB->getID(), Size), - // We only care about the mapping of the destination. - /*NumOperands*/ 1); - } - // Both registers are generic, use G_BITCAST. - LLVM_FALLTHROUGH; - } - case TargetOpcode::G_BITCAST: { - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - unsigned Size = DstTy.getSizeInBits(); - bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64; - bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64; - const RegisterBank &DstRB = - DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; - const RegisterBank &SrcRB = - SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; - return getInstructionMapping( - DefaultMappingID, copyCost(DstRB, SrcRB, Size), - getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), - // We only care about the mapping of the destination for COPY. - /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); - } - default: - break; - } - - unsigned NumOperands = MI.getNumOperands(); - - // Track the size and bank of each register. We don't do partial mappings. - SmallVector OpSize(NumOperands); - SmallVector OpRegBankIdx(NumOperands); - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { - auto &MO = MI.getOperand(Idx); - if (!MO.isReg() || !MO.getReg()) - continue; - - LLT Ty = MRI.getType(MO.getReg()); - OpSize[Idx] = Ty.getSizeInBits(); - - // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. - // For floating-point instructions, scalars go in FPRs. - if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) || - Ty.getSizeInBits() > 64) - OpRegBankIdx[Idx] = PMI_FirstFPR; - else - OpRegBankIdx[Idx] = PMI_FirstGPR; - } - - unsigned Cost = 1; - // Some of the floating-point instructions have mixed GPR and FPR operands: - // fine-tune the computed mapping. - switch (Opc) { - case TargetOpcode::G_TRUNC: { - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; - break; - } - case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: - if (MRI.getType(MI.getOperand(0).getReg()).isVector()) - break; - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; - break; - case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: - if (MRI.getType(MI.getOperand(0).getReg()).isVector()) - break; - OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; - break; - case TargetOpcode::G_FCMP: - OpRegBankIdx = {PMI_FirstGPR, - /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR}; - break; - case TargetOpcode::G_BITCAST: - // This is going to be a cross register bank copy and this is expensive. - if (OpRegBankIdx[0] != OpRegBankIdx[1]) - Cost = copyCost( - *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank, - *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank, - OpSize[0]); - break; - case TargetOpcode::G_LOAD: - // Loading in vector unit is slightly more expensive. - // This is actually only true for the LD1R and co instructions, - // but anyway for the fast mode this number does not matter and - // for the greedy mode the cost of the cross bank copy will - // offset this number. - // FIXME: Should be derived from the scheduling model. 
- if (OpRegBankIdx[0] != PMI_FirstGPR) - Cost = 2; - else - // Check if that load feeds fp instructions. - // In that case, we want the default mapping to be on FPR - // instead of blind map every scalar to GPR. - for (const MachineInstr &UseMI : - MRI.use_instructions(MI.getOperand(0).getReg())) { - // If we have at least one direct use in a FP instruction, - // assume this was a floating point load in the IR. - // If it was not, we would have had a bitcast before - // reaching that instruction. - if (onlyUsesFP(UseMI, MRI, TRI)) { - OpRegBankIdx[0] = PMI_FirstFPR; - break; - } - } - break; - case TargetOpcode::G_STORE: - // Check if that store is fed by fp instructions. - if (OpRegBankIdx[0] == PMI_FirstGPR) { - Register VReg = MI.getOperand(0).getReg(); - if (!VReg) - break; - MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (onlyDefinesFP(*DefMI, MRI, TRI)) - OpRegBankIdx[0] = PMI_FirstFPR; - break; - } - break; - case TargetOpcode::G_SELECT: { - // If the destination is FPR, preserve that. - if (OpRegBankIdx[0] != PMI_FirstGPR) - break; - - // If we're taking in vectors, we have no choice but to put everything on - // FPRs, except for the condition. The condition must always be on a GPR. - LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); - if (SrcTy.isVector()) { - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; - break; - } - - // Try to minimize the number of copies. If we have more floating point - // constrained values than not, then we'll put everything on FPR. Otherwise, - // everything has to be on GPR. - unsigned NumFP = 0; - - // Check if the uses of the result always produce floating point values. - // - // For example: - // - // %z = G_SELECT %cond %x %y - // fpr = G_FOO %z ... - if (any_of( - MRI.use_instructions(MI.getOperand(0).getReg()), - [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) - ++NumFP; - - // Check if the defs of the source values always produce floating point - // values. - // - // For example: - // - // %x = G_SOMETHING_ALWAYS_FLOAT %a ... - // %z = G_SELECT %cond %x %y - // - // Also check whether or not the sources have already been decided to be - // FPR. Keep track of this. - // - // This doesn't check the condition, since it's just whatever is in NZCV. - // This isn't passed explicitly in a register to fcsel/csel. - for (unsigned Idx = 2; Idx < 4; ++Idx) { - Register VReg = MI.getOperand(Idx).getReg(); - MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || - onlyDefinesFP(*DefMI, MRI, TRI)) - ++NumFP; - } - - // If we have more FP constraints than not, then move everything over to - // FPR. - if (NumFP >= 2) - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; - - break; - } - case TargetOpcode::G_UNMERGE_VALUES: { - // If the first operand belongs to a FPR register bank, then make sure that - // we preserve that. - if (OpRegBankIdx[0] != PMI_FirstGPR) - break; - - LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); - // UNMERGE into scalars from a vector should always use FPR. - // Likewise if any of the uses are FP instructions. - if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || - any_of(MRI.use_instructions(MI.getOperand(0).getReg()), - [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { - // Set the register bank of every operand to FPR. 
- for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); - Idx < NumOperands; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; - } - break; - } - case TargetOpcode::G_EXTRACT_VECTOR_ELT: - // Destination and source need to be FPRs. - OpRegBankIdx[0] = PMI_FirstFPR; - OpRegBankIdx[1] = PMI_FirstFPR; - - // Index needs to be a GPR. - OpRegBankIdx[2] = PMI_FirstGPR; - break; - case TargetOpcode::G_INSERT_VECTOR_ELT: - OpRegBankIdx[0] = PMI_FirstFPR; - OpRegBankIdx[1] = PMI_FirstFPR; - - // The element may be either a GPR or FPR. Preserve that behaviour. - if (getRegBank(MI.getOperand(2).getReg(), MRI, TRI) == &AArch64::FPRRegBank) - OpRegBankIdx[2] = PMI_FirstFPR; - else - OpRegBankIdx[2] = PMI_FirstGPR; - - // Index needs to be a GPR. - OpRegBankIdx[3] = PMI_FirstGPR; - break; - case TargetOpcode::G_EXTRACT: { - // For s128 sources we have to use fpr. - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - if (SrcTy.getSizeInBits() == 128) { - OpRegBankIdx[0] = PMI_FirstFPR; - OpRegBankIdx[1] = PMI_FirstFPR; - } - break; - } - case TargetOpcode::G_BUILD_VECTOR: - // If the first source operand belongs to a FPR register bank, then make - // sure that we preserve that. - if (OpRegBankIdx[1] != PMI_FirstGPR) - break; - Register VReg = MI.getOperand(1).getReg(); - if (!VReg) - break; - - // Get the instruction that defined the source operand reg, and check if - // it's a floating point operation. Or, if it's a type like s16 which - // doesn't have a exact size gpr register class. - MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned DefOpc = DefMI->getOpcode(); - const LLT SrcTy = MRI.getType(VReg); - if (isPreISelGenericFloatingPointOpcode(DefOpc) || - SrcTy.getSizeInBits() < 32) { - // Have a floating point op. - // Make sure every operand gets mapped to a FPR register class. - unsigned NumOperands = MI.getNumOperands(); - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; - } - break; - } - - // Finally construct the computed mapping. - SmallVector OpdsMapping(NumOperands); - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { - if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) { - auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); - if (!Mapping->isValid()) - return getInvalidInstructionMapping(); - - OpdsMapping[Idx] = Mapping; - } - } - - return getInstructionMapping(DefaultMappingID, Cost, - getOperandsMapping(OpdsMapping), NumOperands); -} diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h deleted file mode 100644 index e956fca1aa109..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ /dev/null @@ -1,145 +0,0 @@ -//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares the targeting of the RegisterBankInfo class for AArch64. -/// \todo This should be generated by TableGen. 
-//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H - -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" - -#define GET_REGBANK_DECLARATIONS -#include "AArch64GenRegisterBank.inc" - -namespace llvm { - -class TargetRegisterInfo; - -class AArch64GenRegisterBankInfo : public RegisterBankInfo { -protected: - enum PartialMappingIdx { - PMI_None = -1, - PMI_FPR16 = 1, - PMI_FPR32, - PMI_FPR64, - PMI_FPR128, - PMI_FPR256, - PMI_FPR512, - PMI_GPR32, - PMI_GPR64, - PMI_FirstGPR = PMI_GPR32, - PMI_LastGPR = PMI_GPR64, - PMI_FirstFPR = PMI_FPR16, - PMI_LastFPR = PMI_FPR512, - PMI_Min = PMI_FirstFPR, - }; - - static RegisterBankInfo::PartialMapping PartMappings[]; - static RegisterBankInfo::ValueMapping ValMappings[]; - static PartialMappingIdx BankIDToCopyMapIdx[]; - - enum ValueMappingIdx { - InvalidIdx = 0, - First3OpsIdx = 1, - Last3OpsIdx = 22, - DistanceBetweenRegBanks = 3, - FirstCrossRegCpyIdx = 25, - LastCrossRegCpyIdx = 39, - DistanceBetweenCrossRegCpy = 2, - FPExt16To32Idx = 41, - FPExt16To64Idx = 43, - FPExt32To64Idx = 45, - FPExt64To128Idx = 47, - Shift64Imm = 49 - }; - - static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, - unsigned ValLength, const RegisterBank &RB); - static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank, - unsigned Size, unsigned Offset); - static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias, - PartialMappingIdx LastAlias, - ArrayRef Order); - - static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); - - /// Get the pointer to the ValueMapping representing the RegisterBank - /// at \p RBIdx with a size of \p Size. - /// - /// The returned mapping works for instructions with the same kind of - /// operands for up to 3 operands. - /// - /// \pre \p RBIdx != PartialMappingIdx::None - static const RegisterBankInfo::ValueMapping * - getValueMapping(PartialMappingIdx RBIdx, unsigned Size); - - /// Get the pointer to the ValueMapping of the operands of a copy - /// instruction from the \p SrcBankID register bank to the \p DstBankID - /// register bank with a size of \p Size. - static const RegisterBankInfo::ValueMapping * - getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); - - /// Get the instruction mapping for G_FPEXT. - /// - /// \pre (DstSize, SrcSize) pair is one of the following: - /// (32, 16), (64, 16), (64, 32), (128, 64) - /// - /// \return An InstructionMapping with statically allocated OperandsMapping. - static const RegisterBankInfo::ValueMapping * - getFPExtMapping(unsigned DstSize, unsigned SrcSize); - -#define GET_TARGET_REGBANK_CLASS -#include "AArch64GenRegisterBank.inc" -}; - -/// This class provides the information for the target register banks. -class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { - /// See RegisterBankInfo::applyMapping. - void applyMappingImpl(const OperandsMapper &OpdMapper) const override; - - /// Get an instruction mapping where all the operands map to - /// the same register bank and have similar size. - /// - /// \pre MI.getNumOperands() <= 3 - /// - /// \return An InstructionMappings with a statically allocated - /// OperandsMapping. - const InstructionMapping & - getSameKindOfOperandsMapping(const MachineInstr &MI) const; - - /// Returns true if the output of \p MI must be stored on a FPR register. 
- bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const; - - /// Returns true if the source registers of \p MI must all be FPRs. - bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const; - - /// Returns true if the destination register of \p MI must be a FPR. - bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const; - -public: - AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); - - unsigned copyCost(const RegisterBank &A, const RegisterBank &B, - unsigned Size) const override; - - const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, - LLT) const override; - - InstructionMappings - getInstrAlternativeMappings(const MachineInstr &MI) const override; - - const InstructionMapping & - getInstrMapping(const MachineInstr &MI) const override; -}; -} // End llvm namespace. -#endif diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 14f839cd4f812..886158ca44901 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -43,24 +43,27 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) - return CSR_Win_AArch64_CFGuard_Check_SaveList; - if (MF->getSubtarget().isTargetWindows()) - return CSR_Win_AArch64_AAPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::GHC) // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + + // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save + // lists depending on that will need to have their Darwin variant as well. + if (MF->getSubtarget().isTargetDarwin()) + return getDarwinCalleeSavedRegs(MF); + + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) + return CSR_Win_AArch64_CFGuard_Check_SaveList; + if (MF->getSubtarget().isTargetWindows()) + return CSR_Win_AArch64_AAPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) return CSR_AArch64_AAVPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall) return CSR_AArch64_SVE_AAPCS_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) - return MF->getInfo()->isSplitCSR() ? - CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : - CSR_AArch64_CXX_TLS_Darwin_SaveList; if (MF->getSubtarget().getTargetLowering() ->supportSwiftError() && MF->getFunction().getAttributes().hasAttrSomewhere( @@ -68,17 +71,47 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getSubtarget().isTargetDarwin()) - return CSR_Darwin_AArch64_AAPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::Win64) + // This is for OSes other than Windows; Windows is a separate case further + // above. 
+ return CSR_AArch64_AAPCS_X18_SaveList; return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg * +AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + assert(MF->getSubtarget().isTargetDarwin() && + "Invalid subtarget for getDarwinCalleeSavedRegs"); + + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) + report_fatal_error( + "Calling convention CFGuard_Check is unsupported on Darwin."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) + return CSR_Darwin_AArch64_AAVPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall) + report_fatal_error( + "Calling convention SVE_VectorCall is unsupported on Darwin."); + if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo()->isSplitCSR() + ? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList + : CSR_Darwin_AArch64_CXX_TLS_SaveList; + if (MF->getSubtarget().getTargetLowering() + ->supportSwiftError() && + MF->getFunction().getAttributes().hasAttrSomewhere( + Attribute::SwiftError)) + return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return CSR_Darwin_AArch64_RT_MostRegs_SaveList; + return CSR_Darwin_AArch64_AAPCS_SaveList; +} + const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getInfo()->isSplitCSR()) - return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList; return nullptr; } @@ -112,6 +145,32 @@ AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx); } +const uint32_t * +AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + assert(MF.getSubtarget().isTargetDarwin() && + "Invalid subtarget for getDarwinCallPreservedMask"); + + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_Darwin_AArch64_CXX_TLS_RegMask; + if (CC == CallingConv::AArch64_VectorCall) + return CSR_Darwin_AArch64_AAVPCS_RegMask; + if (CC == CallingConv::AArch64_SVE_VectorCall) + report_fatal_error( + "Calling convention SVE_VectorCall is unsupported on Darwin."); + if (CC == CallingConv::CFGuard_Check) + report_fatal_error( + "Calling convention CFGuard_Check is unsupported on Darwin."); + if (MF.getSubtarget() + .getTargetLowering() + ->supportSwiftError() && + MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_Darwin_AArch64_RT_MostRegs_RegMask; + return CSR_Darwin_AArch64_AAPCS_RegMask; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -121,9 +180,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask; - if (CC == CallingConv::CXX_FAST_TLS) - return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask - : CSR_AArch64_CXX_TLS_Darwin_RegMask; + + // All the following calling conventions are handled differently on Darwin. 
+ if (MF.getSubtarget().isTargetDarwin()) { + if (SCS) + report_fatal_error("ShadowCallStack attribute not supported on Darwin."); + return getDarwinCallPreservedMask(MF, CC); + } + if (CC == CallingConv::AArch64_VectorCall) return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; if (CC == CallingConv::AArch64_SVE_VectorCall) @@ -145,7 +209,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) - return CSR_AArch64_TLS_Darwin_RegMask; + return CSR_Darwin_AArch64_TLS_RegMask; assert(TT.isOSBinFormatELF() && "Invalid target"); return CSR_AArch64_TLS_ELF_RegMask; @@ -186,6 +250,8 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // In case that the calling convention does not use the same register for // both, the function should return NULL (does not currently apply) assert(CC != CallingConv::GHC && "should not be GHC calling convention."); + if (MF.getSubtarget().isTargetDarwin()) + return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask; return CSR_AArch64_AAPCS_ThisReturn_RegMask; } @@ -222,7 +288,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { + MCRegister Reg) const { return getReservedRegs(MF)[Reg]; } @@ -240,11 +306,11 @@ void AArch64RegisterInfo::emitReservedArgRegCallError( } bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, - unsigned PhysReg) const { + MCRegister PhysReg) const { return !isReservedReg(MF, PhysReg); } -bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { +bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR; } @@ -390,12 +456,16 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; + // If even offset 0 is illegal, we don't want a virtual base register. + if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) + return false; + // The offset likely isn't legal; we want to allocate a virtual base register. return true; } bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - unsigned BaseReg, + Register BaseReg, int64_t Offset) const { assert(MI && "Unable to get the legal offset for nil instruction."); StackOffset SaveOffset(Offset, MVT::i8); @@ -405,7 +475,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, /// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx /// at the beginning of the basic block. void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, + Register BaseReg, int FrameIdx, int64_t Offset) const { MachineBasicBlock::iterator Ins = MBB->begin(); @@ -426,7 +496,7 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addImm(Shifter); } -void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { // ARM doesn't need the general 64-bit offsets StackOffset Off(Offset, MVT::i8); @@ -445,6 +515,27 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } +// Create a scratch register for the frame index elimination in an instruction. 
+// This function has special handling of stack tagging loop pseudos, in which +// case it can also change the instruction opcode (but not the operands). +static Register +createScratchRegisterForInstruction(MachineInstr &MI, + const AArch64InstrInfo *TII) { + // ST*Gloop have a reserved scratch register in operand 1. Use it, and also + // replace the instruction with the writeback variant because it will now + // satisfy the operand constraints for it. + if (MI.getOpcode() == AArch64::STGloop) { + MI.setDesc(TII->get(AArch64::STGloop_wback)); + return MI.getOperand(1).getReg(); + } else if (MI.getOpcode() == AArch64::STZGloop) { + MI.setDesc(TII->get(AArch64::STZGloop_wback)); + return MI.getOperand(1).getReg(); + } else { + return MI.getMF()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + } +} + void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -461,7 +552,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); bool Tagged = MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; - unsigned FrameReg; + Register FrameReg; // Special handling of dbg_value, stackmap and patchpoint instructions. if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || @@ -531,8 +622,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - Register ScratchReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = createScratchRegisterForInstruction(MI, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } @@ -572,6 +662,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return 32; case AArch64::FPR128_loRegClassID: + case AArch64::FPR64_loRegClassID: + case AArch64::FPR16_loRegClassID: return 16; } } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 2c3f82c530d8a..22a8ba76c6111 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -34,7 +34,7 @@ public: return getEncodingValue(i); } - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; bool isAnyArgRegReserved(const MachineFunction &MF) const; void emitReservedArgRegCallError(const MachineFunction &MF) const; @@ -44,10 +44,13 @@ public: /// Code Generation virtual methods... 
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const; const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; unsigned getCSRFirstUseCost() const override { // The cost will be compared against BlockFrequency where entry has the @@ -83,8 +86,8 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, - unsigned PhysReg) const override; - bool isConstantPhysReg(unsigned PhysReg) const override; + MCRegister PhysReg) const override; + bool isConstantPhysReg(MCRegister PhysReg) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const override; @@ -96,12 +99,12 @@ public: bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg, int FrameIdx, int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, @@ -118,10 +121,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - bool trackLivenessAfterRegAlloc(const MachineFunction&) const override { - return true; - } - unsigned getLocalAddressRegister(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index f52feab039530..bd05c56009a1d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -422,25 +422,35 @@ def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { let Size = 8; } -def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { +def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} + +def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> { let Size = 16; } def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64, v4f16], - 64, (sequence "D%u", 0, 31)>; + v1i64, v4f16, v4bf16], + 64, (sequence "D%u", 0, 31)>; +def FPR64_lo : RegisterClass<"AArch64", + [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32, + v1f64], + 64, (trunc FPR64, 16)>; + // We don't (yet) have an f128 legal type, so don't use that here. We // normalize 128-bit vectors to v2f64 for arg passing and such, so use // that here. def FPR128 : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, - v8f16], + v8f16, v8bf16], 128, (sequence "Q%u", 0, 31)>; // The lower 16 vector registers. 
Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, + v8bf16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. @@ -503,6 +513,9 @@ def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; let PredicateMethod = "isNeonVectorRegLo"; } +def V64_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; +} def V128_lo : RegisterOperand { let ParserMatchClass = VectorRegLoAsmOperand; } @@ -641,6 +654,10 @@ def FPR16Op : RegisterOperand { let ParserMatchClass = FPRAsmOperand<"FPR16">; } +def FPR16Op_lo : RegisterOperand { + let ParserMatchClass = FPRAsmOperand<"FPR16_lo">; +} + def FPR32Op : RegisterOperand { let ParserMatchClass = FPRAsmOperand<"FPR32">; } @@ -664,11 +681,11 @@ def XSeqPairs : RegisterTuples<[sube64, subo64], [(decimate (rotl GPR64, 0), 2), (decimate (rotl GPR64, 1), 2)]>; -def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, +def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, (add WSeqPairs)>{ let Size = 64; } -def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, +def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, (add XSeqPairs)>{ let Size = 128; } @@ -780,7 +797,7 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>; def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>; } -// Enum descibing the element size for destructive +// Enum describing the element size for destructive // operations. class ElementSizeEnum val> { bits<3> Value = val; @@ -862,6 +879,7 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>; class ZPRClass : RegisterClass<"AArch64", [nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], 128, (sequence "Z%u", 0, lastreg)> { diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 28a7e680849b0..fc31e701d3af1 100644 --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -219,7 +219,7 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, SmallVectorImpl &InstDescRepl) { // Check if replacement decision is already available in the cached table. // if so, return it. - std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); + std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU()); auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) return SIMDInstrTable[InstID]; @@ -288,7 +288,8 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { // For this optimization, check for all concerned instructions. 
   case Interleave:
-    std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+    std::string Subtarget =
+        std::string(SchedModel.getSubtargetInfo()->getCPU());
     if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
       return InterlEarlyExit[Subtarget];
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
new file mode 100644
index 0000000000000..cb4dc8462f68d
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -0,0 +1,443 @@
+//===- AArch64SLSHardening.cpp - Harden Straight Line Misspeculation ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass to insert code to mitigate against side channel
+// vulnerabilities that may happen under straight line misspeculation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-sls-hardening"
+
+#define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass"
+
+namespace {
+
+class AArch64SLSHardening : public MachineFunctionPass {
+public:
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const AArch64Subtarget *ST;
+
+  static char ID;
+
+  AArch64SLSHardening() : MachineFunctionPass(ID) {
+    initializeAArch64SLSHardeningPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return AARCH64_SLS_HARDENING_NAME; }
+
+private:
+  bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const;
+  bool hardenBLRs(MachineBasicBlock &MBB) const;
+  MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator) const;
+};
+
+} // end anonymous namespace
+
+char AArch64SLSHardening::ID = 0;
+
+INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening",
+                AARCH64_SLS_HARDENING_NAME, false, false)
+
+static void insertSpeculationBarrier(const AArch64Subtarget *ST,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL,
+                                     bool AlwaysUseISBDSB = false) {
+  assert(MBBI != MBB.begin() &&
+         "Must not insert SpeculationBarrierEndBB as only instruction in MBB.");
+  assert(std::prev(MBBI)->isBarrier() &&
+         "SpeculationBarrierEndBB must only follow unconditional control flow "
+         "instructions.");
+  assert(std::prev(MBBI)->isTerminator() &&
+         "SpeculationBarrierEndBB must only follow terminators.");
+  const TargetInstrInfo *TII = ST->getInstrInfo();
+  unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB
+                            ? AArch64::SpeculationBarrierSBEndBB
+                            : AArch64::SpeculationBarrierISBDSBEndBB;
+  if (MBBI == MBB.end() ||
+      (MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB &&
+       MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB))
+    BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
+}
+
+bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<AArch64Subtarget>();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : MF) {
+    Modified |= hardenReturnsAndBRs(MBB);
+    Modified |= hardenBLRs(MBB);
+  }
+
+  return Modified;
+}
+
+static bool isBLR(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AArch64::BLR:
+  case AArch64::BLRNoIP:
+    return true;
+  case AArch64::BLRAA:
+  case AArch64::BLRAB:
+  case AArch64::BLRAAZ:
+  case AArch64::BLRABZ:
+    llvm_unreachable("Currently, LLVM's code generator does not support "
+                     "producing BLRA* instructions. Therefore, there's no "
+                     "support in this pass for those instructions.");
+  }
+  return false;
+}
+
+bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
+  if (!ST->hardenSlsRetBr())
+    return false;
+  bool Modified = false;
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end();
+  MachineBasicBlock::iterator NextMBBI;
+  for (; MBBI != E; MBBI = NextMBBI) {
+    MachineInstr &MI = *MBBI;
+    NextMBBI = std::next(MBBI);
+    if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) {
+      assert(MI.isTerminator());
+      insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
+      Modified = true;
+    }
+  }
+  return Modified;
+}
+
+static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
+
+static const struct ThunkNameAndReg {
+  const char* Name;
+  Register Reg;
+} SLSBLRThunks[] = {
+  { "__llvm_slsblr_thunk_x0",  AArch64::X0},
+  { "__llvm_slsblr_thunk_x1",  AArch64::X1},
+  { "__llvm_slsblr_thunk_x2",  AArch64::X2},
+  { "__llvm_slsblr_thunk_x3",  AArch64::X3},
+  { "__llvm_slsblr_thunk_x4",  AArch64::X4},
+  { "__llvm_slsblr_thunk_x5",  AArch64::X5},
+  { "__llvm_slsblr_thunk_x6",  AArch64::X6},
+  { "__llvm_slsblr_thunk_x7",  AArch64::X7},
+  { "__llvm_slsblr_thunk_x8",  AArch64::X8},
+  { "__llvm_slsblr_thunk_x9",  AArch64::X9},
+  { "__llvm_slsblr_thunk_x10", AArch64::X10},
+  { "__llvm_slsblr_thunk_x11", AArch64::X11},
+  { "__llvm_slsblr_thunk_x12", AArch64::X12},
+  { "__llvm_slsblr_thunk_x13", AArch64::X13},
+  { "__llvm_slsblr_thunk_x14", AArch64::X14},
+  { "__llvm_slsblr_thunk_x15", AArch64::X15},
+  // X16 and X17 are deliberately missing, as the mitigation requires those
+  // registers to not be used in BLR. See comment in ConvertBLRToBL for more
+  // details.
+  { "__llvm_slsblr_thunk_x18", AArch64::X18},
+  { "__llvm_slsblr_thunk_x19", AArch64::X19},
+  { "__llvm_slsblr_thunk_x20", AArch64::X20},
+  { "__llvm_slsblr_thunk_x21", AArch64::X21},
+  { "__llvm_slsblr_thunk_x22", AArch64::X22},
+  { "__llvm_slsblr_thunk_x23", AArch64::X23},
+  { "__llvm_slsblr_thunk_x24", AArch64::X24},
+  { "__llvm_slsblr_thunk_x25", AArch64::X25},
+  { "__llvm_slsblr_thunk_x26", AArch64::X26},
+  { "__llvm_slsblr_thunk_x27", AArch64::X27},
+  { "__llvm_slsblr_thunk_x28", AArch64::X28},
+  { "__llvm_slsblr_thunk_x29", AArch64::FP},
+  // X30 is deliberately missing, for similar reasons as X16 and X17 are
+  // missing.
+  { "__llvm_slsblr_thunk_x31", AArch64::XZR},
+};
+
+namespace {
+struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
+  const char *getThunkPrefix() { return SLSBLRNamePrefix; }
+  bool mayUseThunk(const MachineFunction &MF) {
+    // FIXME: This could also check if there are any BLRs in the function
+    // to more accurately reflect if a thunk will be needed.
+    return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
+  }
+  void insertThunks(MachineModuleInfo &MMI);
+  void populateThunk(MachineFunction &MF);
+};
+} // namespace
+
+void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+  // FIXME: It probably would be possible to filter which thunks to produce
+  // based on which registers are actually used in BLR instructions in this
+  // function. But would that be a worthwhile optimization?
+  for (auto T : SLSBLRThunks)
+    createThunkFunction(MMI, T.Name);
+}
+
+void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+  // FIXME: How to better communicate Register number, rather than through
+  // name and lookup table?
+  assert(MF.getName().startswith(getThunkPrefix()));
+  auto ThunkIt = llvm::find_if(
+      SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); });
+  assert(ThunkIt != std::end(SLSBLRThunks));
+  Register ThunkReg = ThunkIt->Reg;
+
+  const TargetInstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  assert(MF.size() == 1);
+  MachineBasicBlock *Entry = &MF.front();
+  Entry->clear();
+
+  // These thunks need to consist of the following instructions:
+  //  __llvm_slsblr_thunk_xN:
+  //      BR xN
+  //      barrierInsts
+  Entry->addLiveIn(ThunkReg);
+  // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0
+  BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16)
+      .addReg(AArch64::XZR)
+      .addReg(ThunkReg)
+      .addImm(0);
+  BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16);
+  // Make sure the thunks do not make use of the SB extension in case there is
+  // a function somewhere that will call to it that for some reason disabled
+  // the SB extension locally on that function, even though it's enabled for
+  // the module otherwise. Therefore set AlwaysUseISBDSB to true.
+  insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry,
+                           Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
+}
+
+MachineBasicBlock &
+AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI) const {
+  // Transform a BLR to a BL as follows:
+  // Before:
+  //   |-----------------------------|
+  //   |      ...                    |
+  //   |  instI                      |
+  //   |  BLR xN                     |
+  //   |  instJ                      |
+  //   |      ...                    |
+  //   |-----------------------------|
+  //
+  // After:
+  //   |-----------------------------|
+  //   |      ...                    |
+  //   |  instI                      |
+  //   |  BL __llvm_slsblr_thunk_xN  |
+  //   |  instJ                      |
+  //   |      ...                    |
+  //   |-----------------------------|
+  //
+  //   __llvm_slsblr_thunk_xN:
+  //   |-----------------------------|
+  //   |  BR xN                      |
+  //   |  barrierInsts               |
+  //   |-----------------------------|
+  //
+  // The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter.
+  // This function merely needs to transform BLR xN into BL
+  // __llvm_slsblr_thunk_xN.
+  //
+  // Since linkers are allowed to clobber X16 and X17 on function calls, the
+  // above mitigation only works if the original BLR instruction was not
+  // BLR X16 nor BLR X17. Code generation before must make sure that no BLR
+  // X16|X17 was produced if the mitigation is enabled.
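
[Editor's note: the comment above describes a purely table-driven rewrite: BLR xN becomes BL __llvm_slsblr_thunk_xN, and no thunk exists for x16, x17 or x30. The short standalone C++ sketch below models only that mapping so the exclusions are easy to see; it is not part of the patch, and the helper name slsBlrThunkName is invented for this illustration rather than an LLVM API.]

// Standalone model of the BLR -> BL rewrite described in the comment above.
// slsBlrThunkName is a hypothetical helper, not an LLVM API.
#include <cstdio>
#include <optional>
#include <string>

static std::optional<std::string> slsBlrThunkName(unsigned XRegNum) {
  // x16/x17 may be clobbered by a linker-inserted veneer on the call, and
  // x30 (LR) is overwritten by BL itself, so no thunk can cover them; the
  // code generator must avoid emitting BLR on those registers when the
  // mitigation is enabled.
  if (XRegNum == 16 || XRegNum == 17 || XRegNum == 30 || XRegNum > 31)
    return std::nullopt;
  return "__llvm_slsblr_thunk_x" + std::to_string(XRegNum);
}

int main() {
  for (unsigned N : {0u, 16u, 29u, 30u, 31u}) {
    if (auto Name = slsBlrThunkName(N))
      std::printf("BLR x%u  ->  BL %s\n", N, Name->c_str());
    else
      std::printf("BLR x%u  ->  (no thunk; must not be emitted)\n", N);
  }
  return 0;
}

In the pass itself this mapping is the SLSBLRThunks table, and each thunk body emitted by populateThunk is MOV X16, xN; BR X16 followed by a speculation barrier.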
+
+  MachineInstr &BLR = *MBBI;
+  assert(isBLR(BLR));
+  unsigned BLOpcode;
+  Register Reg;
+  bool RegIsKilled;
+  switch (BLR.getOpcode()) {
+  case AArch64::BLR:
+  case AArch64::BLRNoIP:
+    BLOpcode = AArch64::BL;
+    Reg = BLR.getOperand(0).getReg();
+    assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR);
+    RegIsKilled = BLR.getOperand(0).isKill();
+    break;
+  case AArch64::BLRAA:
+  case AArch64::BLRAB:
+  case AArch64::BLRAAZ:
+  case AArch64::BLRABZ:
+    llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, "
+                     "therefore there is no need to support them for now.");
+  default:
+    llvm_unreachable("unhandled BLR");
+  }
+  DebugLoc DL = BLR.getDebugLoc();
+
+  // If we'd also like to support BLRAA and BLRAB instructions, we'd need
+  // a lot more different kinds of thunks.
+  // For example, a
+  //
+  //    BLRAA xN, xM
+  //
+  // instruction probably would need to be transformed to something like:
+  //
+  //    BL __llvm_slsblraa_thunk_x<N>_x<M>
+  //
+  //  __llvm_slsblraa_thunk_x<N>_x<M>:
+  //      BRAA x<N>, x<M>
+  //      barrierInsts
+  //
+  // Given that about 30 different values of N are possible and about 30
+  // different values of M are possible in the above, with the current way
+  // of producing indirect thunks, we'd be producing about 30 times 30, i.e.
+  // about 900 thunks (where most might not be actually called). This would
+  // multiply further by two to support both BLRAA and BLRAB variants of those
+  // instructions.
+  // If we'd want to support this, we'd probably need to look into a different
+  // way to produce thunk functions, based on which variants are actually
+  // needed, rather than producing all possible variants.
+  // So far, LLVM never produces BLRA* instructions, so let's leave this
+  // for the future when LLVM can start producing BLRA* instructions.
+  MachineFunction &MF = *MBBI->getMF();
+  MCContext &Context = MBB.getParent()->getContext();
+  auto ThunkIt =
+      llvm::find_if(SLSBLRThunks, [Reg](auto T) { return T.Reg == Reg; });
+  assert(ThunkIt != std::end(SLSBLRThunks));
+  MCSymbol *Sym = Context.getOrCreateSymbol(ThunkIt->Name);
+
+  MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
+
+  // Now copy the implicit operands from BLR to BL and copy other necessary
+  // info.
+  // However, both BLR and BL instructions implicitly use SP and implicitly
+  // define LR. Blindly copying implicit operands would result in the SP and
+  // LR operands being present multiple times. While this may not be too much
+  // of an issue, let's avoid that for cleanliness, by removing those implicit
+  // operands from the BL created above before we copy over all implicit
+  // operands from the BLR.
+  int ImpLROpIdx = -1;
+  int ImpSPOpIdx = -1;
+  for (unsigned OpIdx = BL->getNumExplicitOperands();
+       OpIdx < BL->getNumOperands(); OpIdx++) {
+    MachineOperand Op = BL->getOperand(OpIdx);
+    if (!Op.isReg())
+      continue;
+    if (Op.getReg() == AArch64::LR && Op.isDef())
+      ImpLROpIdx = OpIdx;
+    if (Op.getReg() == AArch64::SP && !Op.isDef())
+      ImpSPOpIdx = OpIdx;
+  }
+  assert(ImpLROpIdx != -1);
+  assert(ImpSPOpIdx != -1);
+  int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+  int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+  BL->RemoveOperand(FirstOpIdxToRemove);
+  BL->RemoveOperand(SecondOpIdxToRemove);
+  // Now copy over the implicit operands from the original BLR
+  BL->copyImplicitOps(MF, BLR);
+  MF.moveCallSiteInfo(&BLR, BL);
+  // Also add the register called in the BLR as being used in the called thunk.
+  BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
+                                           RegIsKilled /*isKill*/));
+  // Remove BLR instruction
+  MBB.erase(MBBI);
+
+  return MBB;
+}
+
+bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const {
+  if (!ST->hardenSlsBlr())
+    return false;
+  bool Modified = false;
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMBBI;
+  for (; MBBI != E; MBBI = NextMBBI) {
+    MachineInstr &MI = *MBBI;
+    NextMBBI = std::next(MBBI);
+    if (isBLR(MI)) {
+      ConvertBLRToBL(MBB, MBBI);
+      Modified = true;
+    }
+  }
+  return Modified;
+}
+
+FunctionPass *llvm::createAArch64SLSHardeningPass() {
+  return new AArch64SLSHardening();
+}
+
+namespace {
+class AArch64IndirectThunks : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AArch64IndirectThunks() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
+
+  bool doInitialization(Module &M) override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  std::tuple<SLSBLRThunkInserter> TIs;
+
+  // FIXME: When LLVM moves to C++17, these can become folds
+  template <typename... ThunkInserterT>
+  static void initTIs(Module &M,
+                      std::tuple<ThunkInserterT...> &ThunkInserters) {
+    (void)std::initializer_list<int>{
+        (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+  }
+  template <typename... ThunkInserterT>
+  static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+                     std::tuple<ThunkInserterT...> &ThunkInserters) {
+    bool Modified = false;
+    (void)std::initializer_list<bool>{
+        Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+    return Modified;
+  }
+};
+
+} // end anonymous namespace
+
+char AArch64IndirectThunks::ID = 0;
+
+FunctionPass *llvm::createAArch64IndirectThunks() {
+  return new AArch64IndirectThunks();
+}
+
+bool AArch64IndirectThunks::doInitialization(Module &M) {
+  initTIs(M, TIs);
+  return false;
+}
+
+bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << getPassName() << '\n');
+  auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+  return runTIs(MMI, MF, TIs);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c849d7af9a40b..28a54e6f7d79f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,65 +10,188 @@
 //
 //===----------------------------------------------------------------------===//
 
-def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
+// For predicated nodes where the entire operation is controlled by a governing
+// predicate, please stick to a similar naming convention as used for the
+// ISD nodes:
+//
+//    SDNode <=> AArch64ISD
+//    -------------------------------
+//    _m<n>  <=> _MERGE_OP<n>
+//    _mt    <=> _MERGE_PASSTHRU
+//    _z     <=> _MERGE_ZERO
+//    _p     <=> _PRED
+//
+// Given the context of this file, it is not strictly necessary to use _p to
+// distinguish predicated from unpredicated nodes given that most SVE
+// instructions are predicated.
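
[Editor's note: the suffix convention above encodes what a predicated node writes to lanes where the governing predicate is false: _m<n> keeps source operand n, _mt takes a separate passthru vector, _z writes zero, and _p leaves those lanes unspecified. The standalone C++ sketch below models these lane policies for a 4-lane integer vector; it is not part of the patch, and the applyMerge* names are invented purely for the illustration.]

// Scalar model of the SVE governing-predicate lane policies named above.
// "Op" stands for any element-wise operation; all names are illustrative.
#include <array>
#include <cstddef>
#include <cstdio>

using Vec = std::array<int, 4>;
using Pred = std::array<bool, 4>;
using BinOp = int (*)(int, int);
using UnOp = int (*)(int);

// _m1 / _MERGE_OP1: inactive lanes keep the value of the first source operand.
static Vec applyMergeOp1(Pred G, Vec A, Vec B, BinOp Op) {
  Vec R{};
  for (std::size_t I = 0; I < R.size(); ++I)
    R[I] = G[I] ? Op(A[I], B[I]) : A[I];
  return R;
}

// _z / _MERGE_ZERO: inactive lanes become zero.
static Vec applyMergeZero(Pred G, Vec A, Vec B, BinOp Op) {
  Vec R{};
  for (std::size_t I = 0; I < R.size(); ++I)
    R[I] = G[I] ? Op(A[I], B[I]) : 0;
  return R;
}

// _mt / _MERGE_PASSTHRU: inactive lanes come from a separate passthru vector.
static Vec applyMergePassthru(Pred G, Vec A, Vec Passthru, UnOp Op) {
  Vec R{};
  for (std::size_t I = 0; I < R.size(); ++I)
    R[I] = G[I] ? Op(A[I]) : Passthru[I];
  return R;
}

// _p / _PRED: active lanes are computed as above; inactive lanes carry no
// defined value, so nothing may rely on their contents.

int main() {
  Pred G{true, false, true, false};
  Vec A{1, 2, 3, 4}, B{10, 20, 30, 40}, P{-1, -1, -1, -1};
  Vec M = applyMergeOp1(G, A, B, [](int X, int Y) { return X + Y; });
  Vec Z = applyMergeZero(G, A, B, [](int X, int Y) { return X + Y; });
  Vec T = applyMergePassthru(G, A, P, [](int X) { return -X; });
  std::printf("_m1: %d %d %d %d\n", M[0], M[1], M[2], M[3]);
  std::printf("_z : %d %d %d %d\n", Z[0], Z[1], Z[2], Z[3]);
  std::printf("_mt: %d %d %d %d\n", T[0], T[1], T[2], T[3]);
  return 0;
}

This is the distinction the renamed SDNodes below rely on, e.g. GLD1_MERGE_ZERO versus SMIN_MERGE_OP1 versus FADD_PRED.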
+ +// Contiguous loads - node definitions +// +def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +// Non-faulting & first-faulting loads - node definitions +// +def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +// Contiguous load and replicate - node definitions +// + +def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; + +// Gather loads - node definitions +// +def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [ +def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ +def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, 
[SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +// Contiguous stores - node definitions +// +def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [ + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, + SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2> +]>; + +def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>; + +// Scatter stores - node definitions +// +def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [ +def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, 
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; -def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; -def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; - -def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; - -def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; -def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def 
AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +// AArch64 SVE/SVE2 - the remaining node definitions +// + +// SVE CNT/INC/RDVL +def sve_rdvl_imm : ComplexPattern">; +def sve_cnth_imm : ComplexPattern">; +def sve_cntw_imm : ComplexPattern">; +def sve_cntd_imm : ComplexPattern">; + +// SVE DEC +def sve_cnth_imm_neg : ComplexPattern">; +def sve_cntw_imm_neg : ComplexPattern">; +def sve_cntd_imm_neg : ComplexPattern">; def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>; +def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>; +def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>; +def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>; +def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; +def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; +def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; +def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; +def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; +def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; +def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; +def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; +def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; + +def SDT_AArch64Arith : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3> +]>; + +def SDT_AArch64FMA : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4> +]>; -def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; -def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; -def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; -def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; -def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; -def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; -def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; -def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; -def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; +// Predicated operations with the result of inactive lanes being unspecified. +def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; +def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; +def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; +def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; +def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; + +// Merging op1 into the inactive lanes. 
+def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>; +def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>; +def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>; +def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>; +def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>; +def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>; +def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>; def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; +def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>; def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>; @@ -76,42 +199,57 @@ def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>; def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; -let Predicates = [HasSVE] in { +def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>; +def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>; + +def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>; +def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; - def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; - def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; - def RDFFR_P : sve_int_rdffr_unpred<"rdffr">; - def SETFFR : sve_int_setffr<"setffr">; - def WRFFR : sve_int_wrffr<"wrffr">; +def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; - defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; - defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; - defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; - defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>; - defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>; - defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>; +let Predicates = [HasSVE] in { + defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; + def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; + defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>; + def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>; + def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; + + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>; + defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; + defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; + defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; + defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>; - defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>; - defm SUB_ZPmZ : 
sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>; - defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>; + defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>; + defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; + defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; + + defm ADD_ZPZZ : sve_int_bin_pred_bhsd; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + } defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; - defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; + defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>; defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>; - defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; - defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; - defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; - defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>; + defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; + defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; + defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; + defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; @@ -121,32 +259,45 @@ let Predicates = [HasSVE] in { // SVE predicated integer reductions. 
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>; defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>; - defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>; - defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>; - defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>; - defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>; - defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>; - defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>; - defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>; + defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>; + defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>; + defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>; + defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>; + defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>; + defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>; + defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>; defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>; defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>; defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>; - defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>; - defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>; - defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>; - defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>; + defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>; + defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>; + defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>; + defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>; - defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; - defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; + defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; + defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>; defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>; - defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>; - defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>; - defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>; - defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>; + // Add unpredicated alternative for the mul instruction. 
+ def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2), + (MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>; + def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2), + (MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>; + def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2), + (MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>; + def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2), + (MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>; + + defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">; + defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">; + defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>; + defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>; + + defm SDIV_ZPZZ : sve_int_bin_pred_sd; + defm UDIV_ZPZZ : sve_int_bin_pred_sd; defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; @@ -166,15 +317,20 @@ let Predicates = [HasSVE] in { defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>; defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>; defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat(CNT_ZPmZ_H)>; + } + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>; defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>; - defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>; - defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>; - defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>; - defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>; + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>; + defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>; + defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>; + defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>; defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>; defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>; @@ -190,19 +346,36 @@ let Predicates = [HasSVE] in { defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>; - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>; - defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>; - defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>; - defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>; - defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>; - defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>; - defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>; - defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>; + defm FADD_ZPmZ : 
sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>; + defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; + defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; + defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; + defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>; + defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>; + defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>; defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>; - defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>; - defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>; - defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>; + defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>; + defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>; + defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">; + + defm FADD_ZPZZ : sve_fp_bin_pred_hfd; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; + } defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>; @@ -226,6 +399,16 @@ let Predicates = [HasSVE] in { defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>; defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>; + // Add patterns for FMA where disabled lanes are undef. + // FIXME: Implement a pseudo so we can choose a better instruction after + // regalloc. 
+ def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)), + (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)), + (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)), + (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>; + defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>; @@ -235,12 +418,21 @@ let Predicates = [HasSVE] in { defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; // SVE floating point reductions. - defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>; - defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>; - defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>; - defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>; - defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>; - defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>; + defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; + defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; + defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; + defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; + defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>; + defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>; + + // Use more efficient NEON instructions to extract elements within the NEON + // part (first 128bits) of an SVE register. + def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>; + def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>; + def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), + (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>; // Splat immediate (unpredicated) defm DUP_ZI : sve_int_dup_imm<"dup">; @@ -257,18 +449,88 @@ let Predicates = [HasSVE] in { defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) - defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">; - defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">; + defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>; + defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>; + + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)), + (CPY_ZPmV_H $passthru, $pg, $splat)>; + } + + // Duplicate FP scalar into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), + (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup (bf16 
FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + } + + // Duplicate +0.0 into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + } + + // Duplicate Int immediate into all vector elements + def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_B $a, $b)>; + def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_H $a, $b)>; + def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_S $a, $b)>; + def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))), + (DUP_ZI_D $a, $b)>; + + // Duplicate FP immediate into all vector elements + let AddedComplexity = 2 in { + def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)), + (FDUP_ZI_D fpimm64:$imm8)>; + } // Select elements from either vector (predicated) defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat; + def : SVE_3_Op_Pat; + } + defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat; + } + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>; defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; @@ -277,6 +539,10 @@ let Predicates = [HasSVE] in { defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>; defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_1_Op_Pat; + } + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; @@ -290,34 +556,34 @@ let Predicates = [HasSVE] in { def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; - def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">; - def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">; - def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">; - def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">; + defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; + defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; + defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; + defm BRKPBS_PPzPP : 
sve_int_brkp<0b11, "brkpbs", null_frag>; - def BRKN_PPzP : sve_int_brkn<0b0, "brkn">; - def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">; + defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>; + defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>; - defm BRKA_PPzP : sve_int_break_z<0b000, "brka">; - defm BRKA_PPmP : sve_int_break_m<0b001, "brka">; - defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">; - defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">; - defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">; - defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">; + defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>; + defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>; + defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>; + defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>; + defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; + defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; def PTEST_PP : sve_int_ptest<0b010000, "ptest">; def PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; - defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>; + defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>; defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>; - defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>; + defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>; defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>; defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>; defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>; - defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>; defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>; defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>; defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>; @@ -333,11 +599,23 @@ let Predicates = [HasSVE] in { defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>; defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat; + def : SVE_3_Op_Pat; + def : SVE_3_Op_Pat; + def : SVE_3_Op_Pat; + } + defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>; defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>; defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>; defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + } + // continuous load with reg+immediate defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>; defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>; @@ -468,115 +746,115 @@ let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw] - defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; // Gathers using 32-bit pointers with scaled offset, e.g. 
// ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>; + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>; // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] - defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D 
: sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. // ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, 
AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; @@ -640,16 +918,16 @@ let Predicates = [HasSVE] in { // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] - defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>; - defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>; - defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>; + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>; - defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>; - defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>; - defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d] @@ -722,47 +1000,92 @@ let Predicates = [HasSVE] in { def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; +multiclass sve_prefetch { + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>; + } + + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)), + (RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>; + } + + // default fallback + def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>; + } + + defm : sve_prefetch; + defm : sve_prefetch; + defm : sve_prefetch; + defm : sve_prefetch; + // Gather prefetch using scaled 32-bit offsets, e.g. 
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1] - defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
// prfh pldl1keep, p0, [z0.s, #16] // prfh pldl1keep, p0, [z0.d, #16] - defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; - defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">; defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">; defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">; defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">; + def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_0 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_1 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_2 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_3 $Op1, $Op2)>; + + def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_0 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_1 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>; + defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat; + } + defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>; defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>; @@ -770,6 +1093,15 @@ let Predicates = [HasSVE] in { defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>; defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>; + let Predicates = [HasSVE, HasBF16] in { + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + } + defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>; defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>; defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>; @@ -777,12 +1109,12 @@ let Predicates = [HasSVE] in { defm TRN1_PPP : 
sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>; defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; - defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>; - defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>; - defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>; - defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>; - defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>; - defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; + defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; + defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>; + defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>; defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>; defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>; @@ -795,22 +1127,22 @@ let Predicates = [HasSVE] in { defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>; defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>; - defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>; - defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>; - defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>; - defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>; - defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>; - defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>; - defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>; - defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>; - defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>; - defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>; - - defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo", int_aarch64_sve_fcmpuo>; + defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>; + defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>; + defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>; + defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>; + defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>; + defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>; + defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; + defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; + + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, 
setogt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -928,71 +1260,78 @@ let Predicates = [HasSVE] in { defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; - defm INDEX_RR : sve_int_index_rr<"index">; - defm INDEX_IR : sve_int_index_ir<"index">; - defm INDEX_RI : sve_int_index_ri<"index">; - defm INDEX_II : sve_int_index_ii<"index">; + defm INDEX_RR : sve_int_index_rr<"index", index_vector>; + defm INDEX_IR : sve_int_index_ir<"index", index_vector>; + defm INDEX_RI : sve_int_index_ri<"index", index_vector>; + defm INDEX_II : sve_int_index_ii<"index", index_vector>; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">; - defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>; defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; + + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; + } - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", 
/*isReverseInstr*/ 1>; defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>; - defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, 
nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, 
"fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>; @@ -1004,6 +1343,18 @@ let Predicates = [HasSVE] in { defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>; defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>; + let Predicates = [HasBF16, HasSVE] in { + defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; + defm BFDOT_ZZI : 
sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; + defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>; + defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; + defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; + defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; + defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; + } + // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -1089,6 +1440,20 @@ let Predicates = [HasSVE] in { def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4. + // These get expanded to individual LDR_ZXI/STR_ZXI instructions in + // AArch64ExpandPseudoInsts. + let mayLoad = 1, hasSideEffects = 0 in { + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + let mayStore = 1, hasSideEffects = 0 in { + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), (PTEST_PP PPR:$pg, PPR:$src)>; def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), @@ -1098,6 +1463,25 @@ let Predicates = [HasSVE] in { def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), (PTEST_PP PPR:$pg, PPR:$src)>; + // LD1R of 128-bit masked data + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_B_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_H_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_W_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_D_IMM $gp, $base, (i64 0))>; + + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; @@ -1105,346 +1489,899 @@ let Predicates = [HasSVE] in { def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; def : 
Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>; - def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; - - def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; - - def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; - - def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; - - def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; - - def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; - - def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + // General case that we ideally never want to match. 
+ def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>; + + let AddedComplexity = 5 in { + def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>; + def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>; + + def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>; + def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>; + def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>; + def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>; + + def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>; + def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>; + def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>; + } + + // FIXME: BigEndian requires an additional REV instruction to satisfy the + // constraint that none of the bits change when stored to memory as one + // type, and and reloaded as another type. + let Predicates = [IsLE] in { + def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + + def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + + def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; + + def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : 
Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + + } + + let Predicates = [IsLE, HasBF16, HasSVE] in { + def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + } + + let Predicates = [IsLE, HasSVE, HasBF16] in { + def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + + def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } + + def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + + def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>; + def : 
Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; // Add more complex addressing modes here as required multiclass pred_load { - + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 4-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 8-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + + let Predicates = [HasBF16, HasSVE] in { + defm : pred_load; + } // 16-element contiguous loads - defm : pred_load; + defm : pred_load; multiclass pred_store { + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 4-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 8-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + let Predicates = [HasBF16, HasSVE] in { + defm : pred_store; + } // 16-element contiguous stores - defm : pred_store; + defm : pred_store; + + defm : pred_load; + defm : pred_load; + defm 
: pred_load; + defm : pred_load; + + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + multiclass unpred_store { + let AddedComplexity = 1 in { + def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + let AddedComplexity = 2 in { + def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Store (Ty ZPR:$val), GPR64:$base), + (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>; + defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>; + defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>; + defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>; + defm : unpred_store; + defm : unpred_store; + defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>; + defm : unpred_store; + defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>; + defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>; + defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>; + defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>; + defm : unpred_store< store, nxv4f32, ST1W_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>; + + multiclass unpred_load { + let AddedComplexity = 1 in { + def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + let AddedComplexity = 2 in { + def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Ty (Load GPR64:$base)), + (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>; + defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; + defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; + defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; + defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; + defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; + defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; + defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>; + defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>; + defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>; + defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>; + defm : unpred_load; + defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>; + defm : unpred_load; + defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>; + defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>; + defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>; + defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>; + defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>; + 
defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>; + + multiclass unpred_store_predicate { + def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), + (Store PPR:$val, GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(store (Ty PPR:$Val), GPR64:$base), + (Store PPR:$Val, GPR64:$base, (i64 0))>; + } + + defm Pat_Store_P16 : unpred_store_predicate; + defm Pat_Store_P8 : unpred_store_predicate; + defm Pat_Store_P4 : unpred_store_predicate; + defm Pat_Store_P2 : unpred_store_predicate; + + multiclass unpred_load_predicate { + def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))), + (Load GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(Ty (load GPR64:$base)), + (Load GPR64:$base, (i64 0))>; + } + + defm Pat_Load_P16 : unpred_load_predicate; + defm Pat_Load_P8 : unpred_load_predicate; + defm Pat_Load_P4 : unpred_load_predicate; + defm Pat_Load_P2 : unpred_load_predicate; + + multiclass ld1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + // 4-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + // 8-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + let Predicates = [HasBF16, HasSVE] in { + defm : ld1; + } + + // 16-element contiguous loads + defm : ld1; + + multiclass ldnf1 { + // scalar + immediate (mul vl) + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + // 4-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + // 8-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + let Predicates = [HasBF16, HasSVE] in { + defm : ldnf1; + } + + // 16-element contiguous non-faulting loads + defm : ldnf1; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + multiclass ldff1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + // Base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, XZR)>; + } + + // 2-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + 
// 4-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + // 8-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + let Predicates = [HasBF16, HasSVE] in { + defm : ldff1; + } + + // 16-element contiguous first faulting loads + defm : ldff1; + + multiclass st1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous store + defm : st1; + defm : st1; + defm : st1; + defm : st1; + + // 4-element contiguous store + defm : st1; + defm : st1; + defm : st1; + + // 8-element contiguous store + defm : st1; + defm : st1; + + // 16-element contiguous store + defm : st1; + + def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + + // Insert scalar into vector[0] + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)), + (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>; + + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)), + (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)), + (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)), + (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>; + + // Insert scalar into vector with scalar index + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_B ZPR:$vec, + (CMPEQ_PPzZZ_B (PTRUE_B 31), + (INDEX_II_B 0, 1), + (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_S ZPR:$vec, + 
(CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)), + (CPY_ZPmR_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + GPR64:$src)>; + + // Insert FP scalar into vector with scalar index + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)), + (CPY_ZPmV_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)), + (CPY_ZPmV_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D $index)), + $src)>; + + // Extract element from vector with immediate index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)), + (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + + // Extract element from vector with scalar index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), + ZPR:$vec)>; + + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), + ZPR:$vec)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), + ZPR:$vec)>; +} + +let Predicates = [HasSVE, HasMatMulInt8] in { + defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; + defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>; + defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; + defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", 
int_aarch64_sve_usdot>; + defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; + defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; +} + +let Predicates = [HasSVE, HasMatMulFP32] in { + defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>; +} + +let Predicates = [HasSVE, HasMatMulFP64] in { + defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>; + defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>; + defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>; + defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>; + defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>; + defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>; + defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>; + defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>; + defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; + defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>; + defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; + defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; + defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; + defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; + defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; +} + +let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in { + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; + def : SVE_2_Op_Pat; } let Predicates = [HasSVE2] in { // SVE2 integer multiply-add (indexed) - defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">; - defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">; + defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; + defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; // SVE2 saturating multiply-add high (indexed) - defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">; - defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">; + defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>; + defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>; // SVE2 saturating multiply-add high (vectors, unpredicated) - defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">; - defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">; + defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>; + defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>; // SVE2 integer multiply (indexed) - defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">; + defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>; // SVE2 saturating multiply high (indexed) - defm SQDMULH_ZZZI : 
sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">; - defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">; + defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>; + defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>; // SVE2 signed saturating doubling multiply high (unpredicated) - defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">; - defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">; + defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>; + defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>; // SVE2 integer multiply vectors (unpredicated) - defm MUL_ZZZ : sve2_int_mul<0b000, "mul">; - defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">; - defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">; - def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>; - + defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>; + defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>; + defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>; + defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; + + // Add patterns for unpredicated version of smulh and umulh. + def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), + (SMULH_ZZZ_B $Op1, $Op2)>; + def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), + (SMULH_ZZZ_H $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), + (SMULH_ZZZ_S $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), + (SMULH_ZZZ_D $Op1, $Op2)>; + def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), + (UMULH_ZZZ_B $Op1, $Op2)>; + def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), + (UMULH_ZZZ_H $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), + (UMULH_ZZZ_S $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), + (UMULH_ZZZ_D $Op1, $Op2)>; // SVE2 complex integer dot product (indexed) - defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">; + defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; // SVE2 complex integer dot product - defm CDOT_ZZZ : sve2_cintx_dot<"cdot">; + defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>; // SVE2 complex integer multiply-add (indexed) - defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">; + defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>; // SVE2 complex saturating multiply-add (indexed) - defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">; + defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>; // SVE2 complex integer multiply-add - defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">; - defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">; + defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>; + defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>; // SVE2 integer multiply long (indexed) - defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">; - defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">; - defm UMULLB_ZZZI : 
sve2_int_mul_long_by_indexed_elem<0b010, "umullb">; - defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">; + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>; // SVE2 saturating multiply (indexed) - defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">; - defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">; + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>; // SVE2 integer multiply-add long (indexed) - defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">; - defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">; - defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">; - defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">; - defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">; - defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">; - defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">; - defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">; + defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>; + defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>; + defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>; + defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>; + defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>; + defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>; + defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>; + defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>; // SVE2 integer multiply-add long (vectors, unpredicated) - defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">; - defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">; - defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">; - defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">; - defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">; - defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">; - defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">; - defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">; + defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>; + defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>; + defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>; + defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>; + defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>; + defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>; + defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>; + defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", 
int_aarch64_sve_umlslt>; // SVE2 saturating multiply-add long (indexed) - defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">; - defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">; - defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">; - defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">; + defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>; + defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>; + defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>; + defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>; // SVE2 saturating multiply-add long (vectors, unpredicated) - defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">; - defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">; - defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">; - defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">; + defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>; + defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>; + defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>; + defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>; // SVE2 saturating multiply-add interleaved long - defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">; - defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">; + defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>; + defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>; // SVE2 integer halving add/subtract (predicated) - defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">; - defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">; - defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">; - defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">; - defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">; - defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">; - defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">; - defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">; + defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>; + defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>; + defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>; + defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>; + defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>; + defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>; + defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>; + defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>; // SVE2 integer pairwise add and accumulate long - defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">; - defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">; + defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>; + defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>; // SVE2 integer pairwise arithmetic - defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">; - defm SMAXP_ZPmZ : 
sve2_int_arith_pred<0b101001, "smaxp">; - defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">; - defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">; - defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">; + defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>; + defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>; + defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>; + defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>; + defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>; // SVE2 integer unary operations (predicated) - defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">; - defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">; - defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">; - defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">; + defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>; + defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>; + defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>; + defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>; // SVE2 saturating add/subtract - defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">; - defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">; - defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">; - defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">; - defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">; - defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">; - defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">; - defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">; + defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>; + defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>; + defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>; + defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>; + defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>; + defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>; + defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>; + defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>; // SVE2 saturating/rounding bitwise shift left (predicated) - defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">; - defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">; - defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">; - defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">; - defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">; - defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">; - defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">; - defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">; - defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">; - defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">; - defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; - defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>; + defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>; + defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>; + defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, 
"urshlr", null_frag>; + defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>; + defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>; + defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>; + defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>; + defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>; + defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>; + defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>; + defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>; + + let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in { + defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; + defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; + defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; + defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; + defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; + } // SVE2 predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; + defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; // SVE2 integer add/subtract long - defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; - defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; - defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">; - defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">; - defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">; - defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">; - defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">; - defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">; - defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">; - defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">; - defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">; - defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">; + defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>; + defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>; + defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>; + defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>; + defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>; + defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>; + defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>; + defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>; + defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", 
int_aarch64_sve_sabdlb>; + defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>; + defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>; + defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>; // SVE2 integer add/subtract wide - defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">; - defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">; - defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">; - defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">; - defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">; - defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">; - defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">; - defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">; + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>; + defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>; // SVE2 integer multiply long - defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">; - defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">; - defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">; - defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">; - defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">; - defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">; - defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">; - defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; + defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; + defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>; + defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>; + defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>; + defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>; + defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>; + defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>; // SVE2 bitwise shift and insert - defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">; - defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">; + defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">; - defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">; - defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">; - defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", 
int_aarch64_sve_usra>; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>; // SVE2 complex integer add - defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; - defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">; + defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>; + defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; // SVE2 integer absolute difference and accumulate - defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">; - defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">; + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; // SVE2 integer absolute difference and accumulate long - defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">; - defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">; - defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">; - defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">; + defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; + defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>; + defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>; + defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>; // SVE2 integer add/subtract long with carry - defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">; - defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">; - defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; - defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; + defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>; + defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>; + defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>; + defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>; // SVE2 bitwise shift right narrow (bottom) defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>; @@ -1489,29 +2426,29 @@ let Predicates = [HasSVE2] in { defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; // SVE2 character match - defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; - defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">; + defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>; + defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; // SVE2 bitwise exclusive-or interleaved - defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">; - defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">; + defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; + defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; // SVE2 bitwise shift left long - defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">; - defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">; - defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">; - defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">; + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, 
"ushllb", int_aarch64_sve_ushllb>; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>; // SVE2 integer add/subtract interleaved long - defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">; - defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">; - defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">; + defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; + defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; + defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; // SVE2 histogram generation (segment) - def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">; + def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>; // SVE2 histogram generation (vector) - defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; // SVE2 floating-point base 2 logarithm as integer defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; @@ -1542,50 +2479,57 @@ let Predicates = [HasSVE2] in { defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>; // SVE2 bitwise ternary operations - defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">; - defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">; - def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">; - def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">; - def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; - def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; + defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; + defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; + defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; + defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; + defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; // SVE2 bitwise xor and rotate right by immediate - defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; + defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>; // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>; + defm 
LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) - defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; - defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; + defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; + defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>; + + let Predicates = [HasSVE, HasBF16] in { + def : SVE_3_Op_Pat; + def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)), + (nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1), + nxv8i16:$Op3))>; + } // SVE2 integer compare scalar count and limit defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>; @@ -1599,43 +2543,41 @@ let Predicates = [HasSVE2] in { defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; // SVE2 pointer conflict compare - defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; - defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; } let Predicates = [HasSVE2AES] in { // SVE2 crypto 
destructive binary operations - def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>; - def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>; + defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>; + defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>; // SVE2 crypto unary operations - def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">; - def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">; + defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>; + defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>; // PMULLB and PMULLT instructions which operate with 64-bit source and // 128-bit destination elements are enabled with crypto extensions, similar // to NEON PMULL2 instruction. - def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb", - ZPR128, ZPR64, ZPR64>; - def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt", - ZPR128, ZPR64, ZPR64>; + defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>; } let Predicates = [HasSVE2SM4] in { // SVE2 crypto constructive binary operations - def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>; + defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>; // SVE2 crypto destructive binary operations - def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>; + defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>; } let Predicates = [HasSVE2SHA3] in { // SVE2 crypto constructive binary operations - def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>; + defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } let Predicates = [HasSVE2BitPerm] in { // SVE2 bitwise permute - defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">; - defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">; - defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">; + defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; + defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; + defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index a6df0f3f083cb..c5ff1fcb274b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -26,7 +26,8 @@ def CortexA53Model : SchedMachineModel { // v 1.0 Spreadsheet let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 9f566d1c7079b..7c40da05c3056 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -31,7 +31,8 @@ def CortexA57Model : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -501,7 +502,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // Q form - v16i8, v8i16, v4i32, v2i64 // 
ASIMD bitwise insert, Q-form -def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 798ecb7508c08..8abcb804d5c71 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -18,7 +18,8 @@ def CycloneModel : SchedMachineModel { let MispredictPenalty = 16; // 14-19 cycles are typical. let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -494,7 +495,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; // WriteV includes: // SHLL,SSHLL,USHLL // SLI,SRI -// BIF,BIT,BSL +// BIF,BIT,BSL,BSP // EXT // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN // XTN2 diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d1734c455b2b4..8413a06ed3916 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -24,7 +24,8 @@ def ExynosM3Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -660,7 +661,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index d2284f9fa0b50..34e8beb423ce9 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -24,7 +24,8 @@ def ExynosM4Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -803,7 +804,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index df7402591e7b9..403aac80e47bf 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -24,7 +24,8 @@ def ExynosM5Model : SchedMachineModel { let MispredictPenalty = 15; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list UnsupportedFeatures = SVEUnsupported.F; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); } //===----------------------------------------------------------------------===// @@ -841,7 +842,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 92d03963de57f..a17ab36d7f9e0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -23,8 +23,8 @@ def FalkorModel : SchedMachineModel { let MispredictPenalty = 11; // Minimum branch misprediction penalty. let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 697a0f69c58cb..f2cd83caffa2b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$") def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 0e1a24103121e..ba14bf1f50de1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -27,8 +27,8 @@ def KryoModel : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 4c60992e6351a..bc5ad0f8beced 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln : let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index 3b6aecf5c0353..9c50f97085830 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -25,8 +25,8 @@ def ThunderXT8XModel : SchedMachineModel { let PostRAScheduler = 1; // Use PostRA scheduler. 
let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index e2a293c068774..95c29dd2a567f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -25,8 +25,8 @@ def ThunderX2T99Model : SchedMachineModel { let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; - list UnsupportedFeatures = SVEUnsupported.F; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } @@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td new file mode 100644 index 0000000000000..00838cc4b9bd4 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -0,0 +1,1997 @@ +//=- AArch64SchedThunderX3T110.td - Marvell ThunderX3 T110 ---*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Marvell ThunderX3T110 +// family of processors. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pipeline Description. + +def ThunderX3T110Model : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 70; // 70 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 128; // FIXME: might be much bigger in TX3. + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); + // FIXME: Remove when all errors have been fixed. + let FullInstRWOverlapCheck = 0; +} + +let SchedModel = ThunderX3T110Model in { + +// Issue ports. + +// Port 0: ALU. +def THX3T110P0 : ProcResource<1>; + +// Port 1: ALU. +def THX3T110P1 : ProcResource<1>; + +// Port 2: ALU/Branch. +def THX3T110P2 : ProcResource<1>; + +// Port 3: ALU/Branch. +def THX3T110P3 : ProcResource<1>; + +// Port 4: Load/Store. +def THX3T110P4 : ProcResource<1>; + +// Port 5: Load/store. +def THX3T110P5 : ProcResource<1>; + +// Port 6: FP/Neon/SIMD/Crypto. +def THX3T110P6FP0 : ProcResource<1>; + +// Port 7: FP/Neon/SIMD/Crypto. +def THX3T110P7FP1 : ProcResource<1>; + +// Port 8: FP/Neon/SIMD/Crypto. +def THX3T110P8FP2 : ProcResource<1>; + +// Port 9: FP/Neon/SIMD/Crypto. +def THX3T110P9FP3 : ProcResource<1>; + +// Port 10: Store Data Unit. 
+def THX3T110SD0 : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes. + +// Integer divide/mulhi micro-ops only on port I1. +def THX3T110I1 : ProcResGroup<[THX3T110P1]>; + +// Branch micro-ops on ports I2/I3. +def THX3T110I23 : ProcResGroup<[THX3T110P2, THX3T110P3]>; + +// Branch micro-ops on ports I1/I2/I3. +def THX3T110I123 : ProcResGroup<[THX3T110P1, THX3T110P2, THX3T110P3]>; + +// Integer micro-ops on ports I0/I1/I2. +def THX3T110I012 : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2]>; + +// Integer micro-ops on ports I0/I1/I2/I3. +def THX3T110I0123 : ProcResGroup<[THX3T110P0, THX3T110P1, + THX3T110P2, THX3T110P3]>; + +// FP micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110FP0123 : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// FP micro-ops on ports FP2/FP3. +def THX3T110FP23 : ProcResGroup<[THX3T110P8FP2, THX3T110P9FP3]>; + +// ASIMD micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110SIMD : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// Store data micro-ops only on port 10. +def THX3T110SD : ProcResGroup<[THX3T110SD0]>; + +// Load/store micro-ops on ports P4/P5. +def THX3T110LS : ProcResGroup<[THX3T110P4, THX3T110P5]>; + +// 70 entry unified scheduler. +def THX3T110ANY: ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2, + THX3T110P3, THX3T110P4, THX3T110P5, + THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]> { + let BufferSize = 70; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: THX3T110Write_Cyc_. + +// 3 cycles on I1. +def THX3T110Write_3Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I1. +def THX3T110Write_4Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on I1. +def THX3T110Write_5Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 7 cycles on I1. +def THX3T110Write_7Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 23 cycles on I1. +def THX3T110Write_23Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. +def THX3T110Write_39Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} + +// 1 cycle on I2/I3 +def THX3T110Write_1Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I2/I3 +def THX3T110Write_8Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I1/I2/I3 +def THX3T110Write_1Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I1/I2/I3 +def THX3T110Write_8Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I0/I1/I2/I3. +def THX3T110Write_1Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0/I1/I2/I3. +def THX3T110Write_2Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 3 cycles on I0/I1/I2/I3. +def THX3T110Write_3Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I0/I1/I2/I3. 
+def THX3T110Write_4Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on I0/I1/I2/I3. +def THX3T110Write_5Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on I0/I1/I2/I3. +def THX3T110Write_6Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 8 cycles on I0/I1/I2/I3. +def THX3T110Write_8Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 4; +} + +// 13 cycles on I0/I1/I2/I3. +def THX3T110Write_13Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 13; + let NumMicroOps = 3; +} + +// 23 cycles on I0/I1/I2/I3. +def THX3T110Write_23Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 23; + let NumMicroOps = 3; +} + +// 39 cycles on I0/I1/I2/I3. +def THX3T110Write_39Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 39; + let NumMicroOps = 3; +} + +// 4 cycles on F2/F3. +def THX3T110Write_4Cyc_F23 : SchedWriteRes<[THX3T110FP23]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on F0/F1/F2/F3. +def THX3T110Write_6Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on F0/F1/F2/F3. +def THX3T110Write_7Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on F0/F1/F2/F3. +def THX3T110Write_8Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0/F1/F2/F3. +def THX3T110Write_10Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 3; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [8]; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [11]; +} + +// 1 cycle on LS0/LS1. +def THX3T110Write_1Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +// 2 cycles on LS0/LS1. +def THX3T110Write_2Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1. +def THX3T110Write_4Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +// 5 cycles on LS0/LS1. +def THX3T110Write_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1. +def THX3T110Write_6Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 + 5 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 5 cycles. +// Load vector pair, immed offset, Q-form [LDP/LDNP]. +def THX3T110Write_4_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 5]; +} + +// 4 + 8 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 8 cycles. +// Load vector pair, immed offset, S/D-form [LDP/LDNP]. +def THX3T110Write_4_8Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 8]; +} + +// 11 cycles on LS0/LS1 and I1. 
+def THX3T110Write_11Cyc_LS01_I1 : + SchedWriteRes<[THX3T110LS, THX3T110I1]> { + let Latency = 11; + let NumMicroOps = 4; +} + +// 1 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 1 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I012 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 1 cycle on LS0/LS1 and SD. +def THX3T110Write_1Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1 and SD. +def THX3T110Write_2Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1 and SD. +def THX3T110Write_4Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and SD. +def THX3T110Write_5Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and SD. +def THX3T110Write_6Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycle on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_2Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycles on LS0/LS1 and F0/F1/F2/F3. 
+def THX3T110Write_1Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 5 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_5Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_6Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_7Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_8Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_8Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 12 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_12Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 12; + let NumMicroOps = 4; +} + +// 16 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_16Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 16; + let NumMicroOps = 5; +} + +// 24 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_24Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 24; + let NumMicroOps = 10; +} + +// 32 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_32Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 32; + let NumMicroOps = 14; +} + +// 3 cycles on F0/F1/F2/F3. +def THX3T110Write_3Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on F0/F1/F2/F3. +def THX3T110Write_4Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 10 cycles on F0/F1/F2/F3. +def THX3T110Write_10Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 4; +} + +// 15 cycles on F0/F1/F2/F3. +def THX3T110Write_15Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 15; + let NumMicroOps = 7; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; +} + +// 18 cycles on F0/F1/F2/F3. +def THX3T110Write_18Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 18; + let NumMicroOps = 3; +} + +// 19 cycles on F0/F1/F2/F3. +def THX3T110Write_19Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 19; + let NumMicroOps = 4; +} + +// 20 cycles on F0/F1/F2/F3. +def THX3T110Write_20Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 20; + let NumMicroOps = 4; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 4; +} + +// 3 cycles on F2/F3 and 4 cycles on F0/F1/F2/F3. +def THX3T110Write_3_4Cyc_F23_F0123 : + SchedWriteRes<[THX3T110FP23, THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [3, 4]; +} + + +// Define commonly used read types. + +// No forwarding is provided for these types. 
+def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +//--- +// Branch +//--- +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs Bcc)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs RET)>; +def : InstRW<[THX3T110Write_1Cyc_I23], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +// Move immed +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23/13-39. 
+def : WriteRes { + let Latency = 39; + let ResourceCycles = [39]; + let NumMicroOps = 4; +} + +// Divide, X-form +def : WriteRes { + let Latency = 23; + let ResourceCycles = [23]; + let NumMicroOps = 4; +} + +// Multiply accumulate, W-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; +} + +// Multiply accumulate, X-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX3T110Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX3T110Write_5Cyc_I0123], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX3T110Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX3T110Write_1Cyc_I0123], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "^BFM")>; +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AES[DE]")>; +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AESI?MC")>; +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^PMULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX3T110Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. 
+def : WriteRes { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def THX3T110WriteLDIdx : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +def THX3T110ReadAdrBase : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRQui)>; +def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre, + LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost, + LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost, + LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, + 
LDRQpost, LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre, LDPXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost, + LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSWi)>; + +// Load exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXP(W|X)$")>; + +//--- +// Prefetch +//--- +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : 
InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP. +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURSi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURWi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRXi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRBui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRDui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRHui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRQui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRXui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRWui)>; + +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRBui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRDui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRHui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRQui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRXui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRWui)>; + +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs 
STRBui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRDui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRHui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRQui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRXui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs 
STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + +// Store exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instrs STNPWi, STNPXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXR(B|H|W|X)$")>; + +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP arithmetic +def : InstRW<[THX3T110Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP Mul, Div, Sqrt +def : WriteRes { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THX3T110XWriteFDiv : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX3T110XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX3T110XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THX3T110Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>; + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[THX3T110XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX3T110XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX3T110Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>; + +// FP multiply +// FP multiply accumulate +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMul : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMulAcc : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX3T110XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX3T110XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; + +// FP round to integral +def : InstRW<[THX3T110Write_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FCSEL")>; + 
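As an illustration (not part of the imported patch), the FP divide and square-root entries above show the pattern this model uses for long-running operations: Latency is the cycle count until the result is available, while ResourceCycles is how long the FP0123 group stays reserved, which is what bounds throughput (roughly ResourceCycles spread across the group's four ports per instruction). A minimal sketch of that pattern, using a hypothetical write-type name that does not appear in the patch:

// Illustrative sketch only; THX3T110Write_Example_FDivDP is a hypothetical
// name and is not defined by this patch.
def THX3T110Write_Example_FDivDP : SchedWriteRes<[THX3T110FP0123]> {
  let Latency = 23;          // result available after 23 cycles
  let ResourceCycles = [12]; // FP0123 group kept busy for 12 of them
  let NumMicroOps = 4;       // decodes into 4 micro-ops
}
// An InstRW entry would then bind it to concrete opcodes, e.g.
// def : InstRW<[THX3T110Write_Example_FDivDP], (instrs FDIVDrr)>;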
+//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes { + let Latency = 7; + let NumMicroOps = 3; +} + +// FP move, immed +// FP move, register +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B + +// ASIMD logical (MVN (alias for NOT), ORN, ORR) +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD arith, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : 
InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; + 
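The InstRW entries in this section bind scheduling classes to instructions by matching TableGen record names against anchored regular expressions. As a quick way to sanity-check one of those patterns outside TableGen, the stand-alone sketch below runs the D-form absolute-diff-accumulate pattern from above over a small, hand-picked list of names; the name list is hypothetical (the authoritative records live in the AArch64 instruction .td files) and std::regex stands in for TableGen's own matcher.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  // Hand-picked record names for illustration only; the real set comes from
  // the AArch64 instruction .td files, not from this snippet.
  const std::vector<std::string> Names = {
      "SABAv8i8", "UABAv4i16", "SABAv16i8", // D-form vs. Q-form ABA
      "SABALv8i8_v8i16",                    // ABA *long* -- a different rule
      "UABAv2i32"};

  // "ASIMD absolute diff accum, D-form" pattern from the section above.
  const std::regex DFormABA("^[SU]ABA(v8i8|v4i16|v2i32)$");

  for (const std::string &N : Names)
    std::cout << N
              << (std::regex_match(N, DFormABA) ? "  matches the D-form rule"
                                                : "  does not match")
              << '\n';
  return 0;
}

Only the v8i8, v4i16 and v2i32 variants match; the Q-form and long variants fall through to the other rules listed above.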
+//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith, pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// NOTE: Handled by WriteV. + +// ASIMD FP convert, long and narrow +def : InstRW<[THX3T110Write_5Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv4f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>; +def : InstRW<[THX3T110Write_23Cyc_F0123], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, D-form, FZ +// ASIMD FP multiply accumulate, D-form, no FZ +// ASIMD FP multiply accumulate, Q-form, FZ +// ASIMD FP multiply accumulate, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP negate +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FNEGv")>; + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def :
InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>; + +// ASIMD extract +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^EXTv")>; + +// ASIMD extract narrow +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^XTNv")>; + +// ASIMD extract narrow, saturating +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; + +// ASIMD insert, element to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD move, integer immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^MOVIv")>; + +// ASIMD move, FP immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FMOVv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[THX3T110Write_15Cyc_F0123], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[THX3T110Write_20Cyc_F0123], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, element to word or word +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(S|U)MOVv.*")>; + +// ASIMD transfer gen reg to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD 
load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex
"^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex 
"^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +// V8.1a Atomics (LSE) +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, LDEORX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, + LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, + LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, + LDSMAXALB, LDSMAXALH, LDSMAXALW, 
LDSMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, + LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, + LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, + LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, + LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, + LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, + LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, + LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX, + LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, + LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs SWPB, SWPH, SWPW, SWPX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs SWPALB, SWPALH, SWPALW, SWPALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; + +// V8.3a PAC +def : InstRW<[THX3T110Write_11Cyc_LS01_I1], (instregex "^LDRAA", "^LDRAB")>; +def : InstRW<[THX3T110Write_8Cyc_I123], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, + BRAA, BRAAZ, BRAB, BRABZ)>; +def : InstRW<[THX3T110Write_8Cyc_I123], (instrs RETAA, RETAB)>; + +} // SchedModel = ThunderX3T110Model diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index ba61ed726e840..8f814d185e859 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast(Src); @@ -117,7 +117,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand( - DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16); + DstPtrInfo, MachineMemOperand::MOStore, ObjSize, Align(16)); bool UseSetTagRangeLoop = kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold; @@ -125,21 +125,18 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - if (ObjSize % 32 != 0) { - SDNode *St1 = DAG.getMachineNode( - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, - {MVT::i64, MVT::Other}, - {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); - DAG.setNodeMemRefs(cast(St1), {BaseMemOperand}); - ObjSize -= 16; - Addr = SDValue(St1, 0); - Chain = SDValue(St1, 1); - } - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; - SDNode *St = DAG.getMachineNode( - ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); + + unsigned Opcode; + if (Addr.getOpcode() == ISD::FrameIndex) { + int FI = cast(Addr)->getIndex(); + Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + Opcode = ZeroData ? 
AArch64::STZGloop : AArch64::STGloop; + } else { + Opcode = ZeroData ? AArch64::STZGloop_wback : AArch64::STGloop_wback; + } + SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops); DAG.setNodeMemRefs(cast(St), {BaseMemOperand}); return SDValue(St, 2); diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index d0967fb973cc3..d94fd8471b7b9 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -21,7 +21,8 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, + bool isVolatile, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index f95b5dc5246e9..6fa1c744f77e2 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -16,6 +16,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/TypeSize.h" +#include namespace llvm { diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 975502818fcd2..61f27cbc3b29d 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -19,10 +19,13 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -44,6 +47,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -61,6 +65,11 @@ static cl::opt ClMergeInit( "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("merge stack variable initializers with tagging when possible")); +static cl::opt + ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden, + cl::init(true), cl::ZeroOrMore, + cl::desc("Use Stack Safety analysis results")); + static cl::opt ClScanLimit("stack-tagging-merge-init-scan-limit", cl::init(40), cl::Hidden); @@ -256,8 +265,9 @@ public: Type *EltTy = VecTy->getElementType(); if (EltTy->isPointerTy()) { uint32_t EltSize = DL->getTypeSizeInBits(EltTy); - Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize), - VecTy->getNumElements()); + auto *NewTy = FixedVectorType::get( + IntegerType::get(Ctx, EltSize), + cast(VecTy)->getNumElements()); V = IRB.CreatePointerCast(V, NewTy); } } @@ -275,15 +285,17 @@ class AArch64StackTagging : public FunctionPass { int Tag; // -1 for non-tagged allocations }; - bool MergeInit; + const bool MergeInit; + const bool UseStackSafety; public: static char ID; // Pass ID, replacement for 
typeid - AArch64StackTagging(bool MergeInit = true) + AArch64StackTagging(bool IsOptNone = false) : FunctionPass(ID), - MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit - : MergeInit) { + MergeInit(ClMergeInit.getNumOccurrences() ? ClMergeInit : !IsOptNone), + UseStackSafety(ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety + : !IsOptNone) { initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); } @@ -305,13 +317,16 @@ public: StringRef getPassName() const override { return "AArch64 Stack Tagging"; } private: - Function *F; - Function *SetTagFunc; - const DataLayout *DL; - AAResults *AA; + Function *F = nullptr; + Function *SetTagFunc = nullptr; + const DataLayout *DL = nullptr; + AAResults *AA = nullptr; + const StackSafetyGlobalInfo *SSI = nullptr; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (UseStackSafety) + AU.addRequired(); if (MergeInit) AU.addRequired(); } @@ -323,11 +338,13 @@ char AArch64StackTagging::ID = 0; INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass) INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) -FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) { - return new AArch64StackTagging(MergeInit); +FunctionPass *llvm::createAArch64StackTaggingPass(bool IsOptNone) { + return new AArch64StackTagging(IsOptNone); } Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, @@ -400,7 +417,9 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { // dynamic alloca instrumentation for them as well. !AI.isUsedWithInAlloca() && // swifterror allocas are register promoted by ISel - !AI.isSwiftError(); + !AI.isSwiftError() && + // safe allocas are not interesting + !(SSI && SSI->isSafe(AI)); return IsInteresting; } @@ -482,7 +501,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { auto *NewAI = new AllocaInst( TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); NewAI->takeName(Info.AI); - NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment())); + NewAI->setAlignment(Info.AI->getAlign()); NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); @@ -516,6 +535,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) return false; + if (UseStackSafety) + SSI = &getAnalysis().getResult(); F = &Fn; DL = &Fn.getParent()->getDataLayout(); if (MergeInit) diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 5deb601822b8c..a94856ef4fba3 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -149,7 +149,9 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { continue; const MachineOperand *BaseOp; int64_t Offset; - if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && + bool OffsetIsScalable; + if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, + TRI) && BaseOp->isReg()) { Register BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 3636d8d2b628c..029535cb98b57 100644 --- 
a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -13,12 +13,12 @@ #include "AArch64Subtarget.h" #include "AArch64.h" -#include "AArch64CallLowering.h" #include "AArch64InstrInfo.h" -#include "AArch64LegalizerInfo.h" #include "AArch64PBQPRegAlloc.h" -#include "AArch64RegisterBankInfo.h" #include "AArch64TargetMachine.h" +#include "GISel/AArch64CallLowering.h" +#include "GISel/AArch64LegalizerInfo.h" +#include "GISel/AArch64RegisterBankInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -47,6 +47,18 @@ static cl::opt cl::desc("Call nonlazybind functions via direct GOT load"), cl::init(false), cl::Hidden); +static cl::opt SVEVectorBitsMax( + "aarch64-sve-vector-bits-max", + cl::desc("Assume SVE vector registers are at most this big, " + "with zero meaning no maximum size is assumed."), + cl::init(0), cl::Hidden); + +static cl::opt SVEVectorBitsMin( + "aarch64-sve-vector-bits-min", + cl::desc("Assume SVE vector registers are at least this big, " + "with zero meaning no minimum size is assumed."), + cl::init(0), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, StringRef CPUString) { @@ -68,6 +80,9 @@ void AArch64Subtarget::initializeProperties() { switch (ARMProcFamily) { case Others: break; + case Carmel: + CacheLineSize = 64; + break; case CortexA35: break; case CortexA53: @@ -86,8 +101,16 @@ void AArch64Subtarget::initializeProperties() { case CortexA73: case CortexA75: case CortexA76: + case CortexA77: + case CortexA78: + case CortexX1: PrefFunctionLogAlignment = 4; break; + case A64FX: + CacheLineSize = 256; + PrefFunctionLogAlignment = 5; + PrefLoopLogAlignment = 5; + break; case AppleA7: case AppleA10: case AppleA11: @@ -160,6 +183,17 @@ void AArch64Subtarget::initializeProperties() { PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; break; + case ThunderX3T110: + CacheLineSize = 64; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 2; + MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. 
+ MinVectorRegisterBitWidth = 128; + break; } } @@ -177,6 +211,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, ReserveXRegister.set(18); CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); + InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); Legalizer.reset(new AArch64LegalizerInfo(*this)); auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); @@ -194,6 +229,10 @@ const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } +const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const { + return InlineAsmLoweringInfo.get(); +} + InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -305,3 +344,25 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { if (!MFI.isMaxCallFrameSizeComputed()) MFI.computeMaxCallFrameSize(MF); } + +unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const { + assert(HasSVE && "Tried to get SVE vector length without SVE support!"); + assert(SVEVectorBitsMax % 128 == 0 && + "SVE requires vector length in multiples of 128!"); + assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && + "Minimum SVE vector size should not be larger than its maximum!"); + if (SVEVectorBitsMax == 0) + return 0; + return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; +} + +unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const { + assert(HasSVE && "Tried to get SVE vector length without SVE support!"); + assert(SVEVectorBitsMin % 128 == 0 && + "SVE requires vector length in multiples of 128!"); + assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && + "Minimum SVE vector size should not be larger than its maximum!"); + if (SVEVectorBitsMax == 0) + return (SVEVectorBitsMin / 128) * 128; + return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; +} diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 79c2c161d3cb2..b111f00169488 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -38,11 +39,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { public: enum ARMProcFamilyEnum : uint8_t { Others, + A64FX, AppleA7, AppleA10, AppleA11, AppleA12, AppleA13, + Carmel, CortexA35, CortexA53, CortexA55, @@ -52,6 +55,9 @@ public: CortexA73, CortexA75, CortexA76, + CortexA77, + CortexA78, + CortexX1, ExynosM3, Falkor, Kryo, @@ -63,7 +69,8 @@ public: ThunderXT81, ThunderXT83, ThunderXT88, - TSV110 + TSV110, + ThunderX3T110 }; protected: @@ -75,6 +82,7 @@ protected: bool HasV8_3aOps = false; bool HasV8_4aOps = false; bool HasV8_5aOps = false; + bool HasV8_6aOps = false; bool HasFPARMv8 = false; bool HasNEON = false; @@ -99,6 +107,10 @@ protected: bool HasPAN_RWV = false; bool HasCCPP = false; + // SVE extensions + bool HasSVE = false; + bool UseExperimentalZeroingPseudos = false; + // Armv8.2 Crypto extensions bool HasSM4 = false; bool HasSHA3 = false; @@ -125,8 +137,6 @@ protected: bool HasRCPC_IMMO = false; bool HasLSLFast = false; - bool HasSVE = false; - bool HasSVE2 = 
false; bool HasRCPC = false; bool HasAggressiveFMA = false; @@ -143,7 +153,17 @@ protected: bool HasMTE = false; bool HasTME = false; + // Armv8.6-A Extensions + bool HasBF16 = false; + bool HasMatMulInt8 = false; + bool HasMatMulFP32 = false; + bool HasMatMulFP64 = false; + bool HasAMVS = false; + bool HasFineGrainedTraps = false; + bool HasEnhancedCounterVirtualization = false; + // Arm SVE2 extensions + bool HasSVE2 = false; bool HasSVE2AES = false; bool HasSVE2SM4 = false; bool HasSVE2SHA3 = false; @@ -196,6 +216,8 @@ protected: bool UseEL2ForTP = false; bool UseEL3ForTP = false; bool AllowTaggedGlobals = false; + bool HardenSlsRetBr = false; + bool HardenSlsBlr = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -225,6 +247,7 @@ protected: /// GlobalISel related APIs. std::unique_ptr CallLoweringInfo; + std::unique_ptr InlineAsmLoweringInfo; std::unique_ptr InstSelector; std::unique_ptr Legalizer; std::unique_ptr RegBankInfo; @@ -260,6 +283,7 @@ public: return &getInstrInfo()->getRegisterInfo(); } const CallLowering *getCallLowering() const override; + const InlineAsmLowering *getInlineAsmLowering() const override; InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -347,6 +371,9 @@ public: hasFuseCCSelect() || hasFuseLiterals(); } + bool hardenSlsRetBr() const { return HardenSlsRetBr; } + bool hardenSlsBlr() const { return HardenSlsBlr; } + bool useEL1ForTP() const { return UseEL1ForTP; } bool useEL2ForTP() const { return UseEL2ForTP; } bool useEL3ForTP() const { return UseEL3ForTP; } @@ -359,7 +386,12 @@ public: } unsigned getCacheLineSize() const override { return CacheLineSize; } unsigned getPrefetchDistance() const override { return PrefetchDistance; } - unsigned getMinPrefetchStride() const override { return MinPrefetchStride; } + unsigned getMinPrefetchStride(unsigned NumMemAccesses, + unsigned NumStridedMemAccesses, + unsigned NumPrefetches, + bool HasCall) const override { + return MinPrefetchStride; + } unsigned getMaxPrefetchIterationsAhead() const override { return MaxPrefetchIterationsAhead; } @@ -372,6 +404,10 @@ public: unsigned getWideningBaseCost() const { return WideningBaseCost; } + bool useExperimentalZeroingPseudos() const { + return UseExperimentalZeroingPseudos; + } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. 
bool supportsAddressTopByteIgnored() const; @@ -401,6 +437,16 @@ public: bool hasSVE2SM4() const { return HasSVE2SM4; } bool hasSVE2SHA3() const { return HasSVE2SHA3; } bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } + bool hasMatMulInt8() const { return HasMatMulInt8; } + bool hasMatMulFP32() const { return HasMatMulFP32; } + bool hasMatMulFP64() const { return HasMatMulFP64; } + + // Armv8.6-A Extensions + bool hasBF16() const { return HasBF16; } + bool hasFineGrainedTraps() const { return HasFineGrainedTraps; } + bool hasEnhancedCounterVirtualization() const { + return HasEnhancedCounterVirtualization; + } bool isLittleEndian() const { return IsLittle; } @@ -438,6 +484,7 @@ public: bool hasDIT() const { return HasDIT; } bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } + bool hasAMVS() const { return HasAMVS; } bool hasSEL2() const { return HasSEL2; } bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } @@ -497,6 +544,12 @@ public: } void mirFileLoaded(MachineFunction &MF) const override; + + // Return the known range for the bit length of SVE data registers. A value + // of 0 means nothing is known about that particular limit beyond what's + // implied by the architecture. + unsigned getMaxSVEVectorSizeInBits() const; + unsigned getMinSVEVectorSizeInBits() const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 6e82d326e5194..ceceabc6ff4ed 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,18 +18,18 @@ include "llvm/TableGen/SearchableTable.td" //===----------------------------------------------------------------------===// def HasCCPP : Predicate<"Subtarget->hasCCPP()">, - AssemblerPredicate<"FeatureCCPP", "ccpp">; + AssemblerPredicate<(all_of FeatureCCPP), "ccpp">; def HasPAN : Predicate<"Subtarget->hasPAN()">, - AssemblerPredicate<"FeaturePAN", + AssemblerPredicate<(all_of FeaturePAN), "ARM v8.1 Privileged Access-Never extension">; def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, - AssemblerPredicate<"FeaturePsUAO", + AssemblerPredicate<(all_of FeaturePsUAO), "ARM v8.2 UAO PState extension (psuao)">; def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, - AssemblerPredicate<"FeaturePAN_RWV", + AssemblerPredicate<(all_of FeaturePAN_RWV), "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; //===----------------------------------------------------------------------===// @@ -338,7 +338,7 @@ def : PState<"PAN", 0b00100>; // v8.2a "User Access Override" extension-specific PStates let Requires = [{ {AArch64::FeaturePsUAO} }] in def : PState<"UAO", 0b00011>; -// v8.4a timining insensitivity of data processing instructions +// v8.4a timing insensitivity of data processing instructions let Requires = [{ {AArch64::FeatureDIT} }] in def : PState<"DIT", 0b11010>; // v8.5a Spectre Mitigation @@ -844,7 +844,7 @@ def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; -def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : ROSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100,
0b0011, 0b010>; @@ -1167,7 +1167,6 @@ def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; -def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; @@ -1185,9 +1184,8 @@ def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>; def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>; def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>; def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>; -def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; +def : ROSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>; -def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>; def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>; def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>; @@ -1260,7 +1258,7 @@ let Requires = [{ {AArch64::FeatureSPE} }] in { def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; -def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : ROSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; @@ -1269,7 +1267,7 @@ def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; -def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +def : ROSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; } // v8.2a "RAS extension" registers @@ -1333,7 +1331,6 @@ def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; let Requires = [{ {AArch64::FeatureRASv8_4} }] in { def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>; def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>; -def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>; def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>; def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>; def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>; @@ -1360,7 +1357,7 @@ def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>; def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; } //FeatureMPAM -// v8.4a Activitiy Monitor registers +// v8.4a Activity Monitor registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureAM} }] in { def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>; @@ -1426,7 +1423,7 @@ def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>; } //FeatureTRACEV8_4 -// v8.4a Timining 
insensitivity of data processing instructions +// v8.4a Timing insensitivity of data processing instructions // DIT: Data Independent Timing instructions // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureDIT} }] in { @@ -1490,6 +1487,41 @@ def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; } // FeatureTRBE + +// v8.6a Activity Monitors Virtualization Support +let Requires = [{ {AArch64::FeatureAMVS} }] in { +foreach n = 0-15 in { + foreach x = 0-1 in { + def : RWSysReg<"AMEVCNTVOFF"#x#n#"_EL2", + 0b11, 0b100, 0b1101, 0b1000, 0b000>{ + let Encoding{4} = x; + let Encoding{3-0} = n; + } + } +} +} + +// v8.6a Fine Grained Virtualization Traps +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureFineGrainedTraps} }] in { +def : RWSysReg<"HFGRTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b100>; +def : RWSysReg<"HFGWTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b101>; +def : RWSysReg<"HFGITR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b110>; +def : RWSysReg<"HDFGRTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b100>; +def : RWSysReg<"HDFGWTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b101>; +} + +// v8.6a Enhanced Counter Virtualization +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureEnhancedCounterVirtualization} }] in { +def : RWSysReg<"CNTSCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b100>; +def : RWSysReg<"CNTISCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTPOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b110>; +def : RWSysReg<"CNTVFRQ_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b111>; +def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>; +} + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcAppleA7} }] in diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 115a7da8a6d90..a63b9a97ada55 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" @@ -26,6 +27,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -146,6 +148,11 @@ static cl::opt EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); +static cl::opt EnableSVEIntrinsicOpts( + "aarch64-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); + static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -176,13 +183,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeAArch64LoadStoreOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); + initializeAArch64PostLegalizerCombinerPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); initializeFalkorHWPFFixPass(*PR); initializeFalkorMarkStridedAccessesLegacyPass(*PR); 
initializeLDTLSCleanupPass(*PR); + initializeSVEIntrinsicOptsPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); + initializeAArch64SLSHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); initializeAArch64StackTaggingPreRAPass(*PR); } @@ -236,12 +246,8 @@ getEffectiveAArch64CodeModel(const Triple &TT, Optional CM, if (CM) { if (*CM != CodeModel::Small && *CM != CodeModel::Tiny && *CM != CodeModel::Large) { - if (!TT.isOSFuchsia()) - report_fatal_error( - "Only small, tiny and large code models are allowed on AArch64"); - else if (*CM != CodeModel::Kernel) - report_fatal_error("Only small, tiny, kernel, and large code models " - "are allowed on AArch64"); + report_fatal_error( + "Only small, tiny and large code models are allowed on AArch64"); } else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF()) report_fatal_error("tiny code model is only supported on ELF"); return *CM; @@ -313,6 +319,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports default outlining behaviour. setSupportsDefaultOutlining(true); + + // AArch64 supports the debug entry values. + setSupportsDebugEntryValues(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; @@ -403,6 +412,7 @@ public: bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; @@ -435,6 +445,10 @@ void AArch64PassConfig::addIRPasses() { // ourselves. addPass(createAtomicExpandPass()); + // Expand any SVE vector library calls that we can't code generate directly. + if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive) + addPass(createSVEIntrinsicOptsPass()); + // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. @@ -454,6 +468,9 @@ void AArch64PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); + addPass(createAArch64StackTaggingPass( + /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); @@ -473,9 +490,6 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } - addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() != - CodeGenOpt::None)); - // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); @@ -541,6 +555,14 @@ bool AArch64PassConfig::addLegalizeMachineIR() { return false; } +void AArch64PassConfig::addPreRegBankSelect() { + // For now we don't add this to the pipeline for -O0. We could do in future + // if we split the combines into separate O0/opt groupings. + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + if (!IsOptNone) + addPass(createAArch64PostLegalizeCombiner(IsOptNone)); +} + bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; @@ -614,6 +636,9 @@ void AArch64PassConfig::addPreSched2() { // info. 
addPass(createAArch64SpeculationHardeningPass()); + addPass(createAArch64IndirectThunks()); + addPass(createAArch64SLSHardeningPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableFalkorHWPFFix) addPass(createFalkorHWPFFixPass()); @@ -648,4 +673,28 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && TM->getTargetTriple().isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); + + // SVE bundles move prefixes with destructive operations. + addPass(createUnpackMachineBundles(nullptr)); +} + +yaml::MachineFunctionInfo * +AArch64TargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::AArch64FunctionInfo(); +} + +yaml::MachineFunctionInfo * +AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo(); + return new yaml::AArch64FunctionInfo(*MFI); +} + +bool AArch64TargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const auto &YamlMFI = + reinterpret_cast(MFI); + MachineFunction &MF = PFS.MF; + MF.getInfo()->initializeBaseYamlFields(YamlMFI); + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 5264efb89b9c5..7738a42293919 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -49,6 +49,14 @@ public: return TLOF.get(); } + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; + private: bool isLittle; }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 54562094fcf56..dfc66f0cb4c16 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -20,7 +20,6 @@ using namespace dwarf; void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); // AARCH64 ELF ABI does not define static relocation type for TLS offset // within a module. Do not generate AT_location for TLS variables. 
SupportDebugThreadLocalLocation = false; @@ -43,7 +42,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } @@ -68,7 +67,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index 1cb4c028c80d2..28324c2ae608f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -18,6 +18,11 @@ class AArch64TargetMachine; /// This implementation is used for AArch64 ELF targets (Linux in particular). class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + +public: + AArch64_ELFTargetObjectFile() { + PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; + } }; /// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4724d6b8daea7..cf6de797727be 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -57,7 +57,8 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// Calculate the cost of materializing the given constant. -int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -82,7 +83,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { } int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -139,16 +141,17 @@ int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Idx == ImmIdx) { int NumConstants = (BitSize + 63) / 64; - int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast(TTI::TCC_Free) : Cost; } - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -161,7 +164,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, // selected instruction, so we compute the materialization cost for the // immediate directly. 
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); switch (IID) { default: @@ -174,7 +177,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, case Intrinsic::umul_with_overflow: if (Idx == 1) { int NumConstants = (BitSize + 63) / 64; - int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast(TTI::TCC_Free) : Cost; @@ -190,7 +193,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, return TTI::TCC_Free; break; } - return AArch64TTIImpl::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } TargetTransformInfo::PopcntSupportKind @@ -208,8 +211,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // A helper that returns a vector type from the given type. The number of // elements in type Ty determine the vector width. auto toVectorTy = [&](Type *ArgTy) { - return VectorType::get(ArgTy->getScalarType(), - DstTy->getVectorNumElements()); + return FixedVectorType::get(ArgTy->getScalarType(), + cast(DstTy)->getNumElements()); }; // Exit early if DstTy is not a vector type whose elements are at least @@ -251,7 +254,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // Legalize the source type and ensure it can be used in a widening // operation. - Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto *SrcTy = toVectorTy(Extend->getSrcTy()); auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) @@ -267,6 +270,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, } int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -291,11 +295,18 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } } + // TODO: Allow non-throughput costs that aren't binary. + auto AdjustCost = [&CostKind](int Cost) { + if (CostKind != TTI::TCK_RecipThroughput) + return Cost == 0 ? 
0 : 1; + return Cost; + }; + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); static const TypeConversionCostTblEntry ConversionTbl[] = { @@ -397,9 +408,9 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); } int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -425,17 +436,18 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); auto DstVT = TLI->getValueType(DL, Dst); auto SrcVT = TLI->getValueType(DL, Src); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If the resulting type is still a vector and the destination type is legal, // we may get the extension for free. If not, get the default cost for the // extend. if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); // The destination type should be larger than the element type. If not, get // the default cost for the extend. if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); switch (Opcode) { default: @@ -454,7 +466,16 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, } // If we are unable to perform the extend for free, get the default cost. - return Cost + getCastInstrCost(Opcode, Dst, Src); + return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); +} + +unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind) { + if (CostKind != TTI::TCK_RecipThroughput) + return Opcode == Instruction::PHI ? 0 : 1; + assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); + // Branches are assumed to be predicted. + return 0; } int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, @@ -483,10 +504,17 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } int AArch64TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI) { + // TODO: Handle more cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); + // Legalize the type. 
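The AdjustCost helper introduced in the getCastInstrCost hunk is split awkwardly across the lines above; reassembled, it simply collapses any non-throughput query to a 0-or-1 answer:

// TODO: Allow non-throughput costs that aren't binary.
auto AdjustCost = [&CostKind](int Cost) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;   // size/latency queries: either free or one unit
  return Cost;                  // throughput queries keep the table value
};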
std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -504,7 +532,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::SDIV: if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -513,16 +542,20 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return Cost; @@ -535,31 +568,34 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Vector signed division by constant are expanded to the // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division // to MULHS + SUB + SRL + ADD + SRL. - int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info, - Opd2Info, + int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, - Opd2Info, + int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, - Opd2Info, + int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, + Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; } } - Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); if (Ty->isVectorTy()) { // On AArch64, vector divisions are not supported natively and are // expanded into scalar divisions of each pair of elements. - Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info, - Opd2Info, Opd1PropInfo, Opd2PropInfo); - Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info, - Opd2Info, Opd1PropInfo, Opd2PropInfo); + Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo); + Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo); // TODO: if one of the arguments is scalar, then it's not necessary to // double the cost of handling the vector elements. 
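The ISD::SDIV case above prices the power-of-two path as the sum of the four operations it expands to. As a reminder of what that ADD + CMP + SELECT + SRA sequence computes, here is a standalone sketch (not LLVM code) of signed division by 2^K with rounding toward zero:

#include <cstdint>

// Signed division by 2^K, rounding toward zero like C: bias negative
// dividends by (2^K - 1) so the arithmetic shift does not round toward
// negative infinity. The bias is the ADD, the sign test is the CMP + SELECT,
// and the final shift is the SRA.
int64_t sdivPow2(int64_t X, unsigned K) {
  int64_t Bias = (int64_t{1} << K) - 1;
  int64_t Adjusted = X < 0 ? X + Bias : X;
  return Adjusted >> K;
}
// e.g. sdivPow2(-7, 1) == -3 (matches -7 / 2 in C), whereas -7 >> 1 == -4.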
Cost += Cost; @@ -574,6 +610,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. return (Cost + 1) * LT.first; + + case ISD::FADD: + // These nodes are marked as 'custom' just to lower them to SVE. + // We know said lowering will incur no additional cost. + if (isa(Ty) && !Ty->getScalarType()->isFP128Ty()) + return (Cost + 2) * LT.first; + + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, + Opd1PropInfo, Opd2PropInfo); } } @@ -596,7 +642,12 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy, const Instruction *I) { + Type *CondTy, + TTI::TargetCostKind CostKind, + const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -623,13 +674,18 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); } AArch64TTIImpl::TTI::MemCmpExpansionOptions AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; - Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); + if (ST->requiresStrictAlign()) { + // TODO: Add cost modeling for strict align. Misaligned loads expand to + // a bunch of instructions when strict align is enabled. + return Options; + } + Options.AllowOverlappingLoads = true; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); Options.NumLoadsPerBlock = Options.MaxNumLoads; // TODO: Though vector loads usually perform well on AArch64, in some targets @@ -641,7 +697,17 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return 1; + + // Type legalization can't handle structs + if (TLI->getValueType(DL, Ty, true) == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, + CostKind); + auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && @@ -656,7 +722,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return LT.first * 2 * AmortizationCost; } - if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) { + if (Ty->isVectorTy() && + cast(Ty)->getElementType()->isIntegerTy(8)) { unsigned ProfitableNumElements; if (Opcode == Instruction::Store) // We use a custom trunc store lowering so v.4b should be profitable. @@ -666,8 +733,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, // have to promote the elements to v.2. 
ProfitableNumElements = 8; - if (Ty->getVectorNumElements() < ProfitableNumElements) { - unsigned NumVecElts = Ty->getVectorNumElements(); + if (cast(Ty)->getNumElements() < ProfitableNumElements) { + unsigned NumVecElts = cast(Ty)->getNumElements(); unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; // We generate 2 instructions per vector element. return NumVectorizableInstsToAmortize * NumVecElts * 2; @@ -677,20 +744,18 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return LT.first; } -int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int AArch64TTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); - assert(isa(VecTy) && "Expect a vector type"); + auto *VecVTy = cast(VecTy); if (!UseMaskForCond && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { - unsigned NumElts = VecTy->getVectorNumElements(); - auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); + unsigned NumElts = VecVTy->getNumElements(); + auto *SubVecTy = + FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -701,18 +766,20 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { int Cost = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; for (auto *I : Tys) { if (!I->isVectorTy()) continue; - if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128) - Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) + - getMemoryOpCost(Instruction::Load, I, Align(128), 0); + if (I->getScalarSizeInBits() * cast(I)->getNumElements() == + 128) + Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + + getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); } return Cost; } @@ -792,6 +859,11 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, getFalkorUnrollingPreferences(L, SE, UP); } +void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} + Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType) { switch (Inst->getIntrinsicID()) { @@ -902,7 +974,7 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { - assert(isa(Ty) && "Expected Ty to be a vector type"); + auto *VTy = cast(Ty); unsigned ScalarBits = Ty->getScalarSizeInBits(); switch (Opcode) { case Instruction::FAdd: @@ -913,10 +985,10 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, case Instruction::Mul: return false; case Instruction::Add: - return ScalarBits * Ty->getVectorNumElements() >= 128; + return ScalarBits * cast(VTy)->getNumElements() >= 128; case Instruction::ICmp: return 
(ScalarBits < 64) && - (ScalarBits * Ty->getVectorNumElements() >= 128); + (ScalarBits * cast(VTy)->getNumElements() >= 128); case Instruction::FCmp: return Flags.NoNaN; default: @@ -925,11 +997,14 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, return false; } -int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwiseForm) { +int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, + VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind) { if (IsPairwiseForm) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -950,11 +1025,12 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); } -int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + int Index, VectorType *SubTp) { if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) { static const CostTblEntry ShuffleTbl[] = { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 6f4569a497831..1f029689a60e6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -72,11 +72,11 @@ public: using BaseT::getIntImmCost; int getIntImmCost(int64_t Val); - int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); + Type *Ty, TTI::TargetCostKind CostKind); int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + Type *Ty, TTI::TargetCostKind CostKind); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -98,6 +98,8 @@ public: unsigned getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasSVE()) + return std::max(ST->getMinSVEVectorSizeInBits(), 128u); if (ST->hasNEON()) return 128; return 0; @@ -112,15 +114,19 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, @@ -131,30 +137,37 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); TTI::MemCmpExpansionOptions 
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, - unsigned AddressSpace, const Instruction *I = nullptr); + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); int getCostOfKeepingLiveOverCall(ArrayRef Tys); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); + Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - bool isLegalMaskedLoadStore(Type *DataType, MaybeAlign Alignment) { - if (!isa(DataType) || !ST->hasSVE()) + bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) { + if (!isa(DataType) || !ST->hasSVE()) return false; - Type *Ty = DataType->getVectorElementType(); - if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) + Type *Ty = cast(DataType)->getElementType(); + if (Ty->isBFloatTy() || Ty->isHalfTy() || + Ty->isFloatTy() || Ty->isDoubleTy()) return true; if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || @@ -164,26 +177,58 @@ public: return false; } - bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataType, Align Alignment) { return isLegalMaskedLoadStore(DataType, Alignment); } - bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedStore(Type *DataType, Align Alignment) { return isLegalMaskedLoadStore(DataType, Alignment); } - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, - ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); + bool isLegalNTStore(Type *DataType, Align Alignment) { + // NOTE: The logic below is mostly geared towards LV, which calls it with + // vectors with 2 elements. We might want to improve that, if other + // users show up. + // Nontemporal vector stores can be directly lowered to STNP, if the vector + // can be halved so that each half fits into a register. That's the case if + // the element type fits into a register and the number of elements is a + // power of 2 > 1. + if (auto *DataTypeVTy = dyn_cast(DataType)) { + unsigned NumElements = + cast(DataTypeVTy)->getNumElements(); + unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits(); + return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 && + EltSize <= 128 && isPowerOf2_64(EltSize); + } + return BaseT::isLegalNTStore(DataType, Alignment); + } + + int getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, + Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, + bool UseMaskForCond = false, bool UseMaskForGaps = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); bool shouldExpandReduction(const IntrinsicInst *II) const { - return false; + switch (II->getIntrinsicID()) { + case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::experimental_vector_reduce_v2_fmul: + // We don't have legalization support for ordered FP reductions. + return !II->getFastMathFlags().allowReassoc(); + + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + // Lowering asserts that there are no NaNs. 
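The isLegalNTStore hook added above boils down to a shape check on the vector: it must split into two register-sized halves so the store can become an STNP. The same predicate written over plain integers (illustrative helper, not the LLVM API):

#include <cstdint>

// STNP can handle the nontemporal store if the vector splits into two
// register-sized halves: more than one element, a power-of-two element
// count, and an element size that is itself a power of two in [8, 128] bits.
bool isLegalNTStoreShape(uint64_t NumElements, uint64_t EltSizeInBits) {
  auto isPow2 = [](uint64_t V) { return V && (V & (V - 1)) == 0; };
  return NumElements > 1 && isPow2(NumElements) &&
         EltSizeInBits >= 8 && EltSizeInBits <= 128 && isPow2(EltSizeInBits);
}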
+ return !II->getFastMathFlags().noNaNs(); + + default: + // Don't expand anything else, let legalization deal with it. + return false; + } } unsigned getGISelRematGlobalCost() const { @@ -193,10 +238,12 @@ public: bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; - int getArithmeticReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm); + int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); - int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, + VectorType *SubTp); /// @} }; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index be4c960224727..0ac09c4f96f04 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -260,6 +260,8 @@ public: bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; bool ParseDirective(AsmToken DirectiveID) override; unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; @@ -755,12 +757,13 @@ public: return false; int64_t Val = MCE->getValue(); - int64_t SVal = typename std::make_signed::type(Val); - int64_t UVal = typename std::make_unsigned::type(Val); - if (Val != SVal && Val != UVal) + // Avoid left shift by 64 directly. + uint64_t Upper = UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4); + // Allow all-0 or all-1 in top bits to permit bitwise NOT. 
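The Upper mask built just above uses two half-width shifts because a single shift of a 64-bit value by 64 is undefined behaviour in C++. A standalone illustration with a hypothetical upperMask helper:

#include <cstdint>

// Build a mask of the bits above the low sizeof(T)*8 bits of a uint64_t.
// Shifting twice by half the width sidesteps the undefined behaviour of a
// single shift by 64 when T is a 64-bit type.
template <typename T> constexpr uint64_t upperMask() {
  return ~uint64_t{0} << (sizeof(T) * 4) << (sizeof(T) * 4);
}

static_assert(upperMask<int32_t>() == 0xffffffff00000000ull, "top 32 bits");
static_assert(upperMask<int64_t>() == 0, "every bit shifted out, and no UB");
// The parser then accepts Val only if (Val & Upper) is all zeroes or all
// ones, so the bitwise-NOT form of a narrow logical immediate still matches.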
+ if ((Val & Upper) && (Val & Upper) != Upper) return false; - return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8); + return AArch64_AM::isLogicalImmediate(Val & ~Upper, sizeof(T) * 8); } bool isShiftedImm() const { return Kind == k_ShiftedImm; } @@ -852,8 +855,7 @@ public: if (!isShiftedImm() && (!isImm() || !isa(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = - std::is_same::type>::value; + bool IsByte = std::is_same>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVECpyImm(uint64_t(ShiftedImm->first) @@ -870,8 +872,7 @@ public: if (!isShiftedImm() && (!isImm() || !isa(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = - std::is_same::type>::value; + bool IsByte = std::is_same>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVEAddSubImm(ShiftedImm->first @@ -969,11 +970,15 @@ public: bool isMOVZMovAlias() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); + const MCExpr *E = getImm(); + if (const MCConstantExpr *CE = dyn_cast(E)) { + uint64_t Value = CE->getValue(); - return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); + return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); + } + // Only supports the case of Shift being 0 if an expression is used as an + // operand + return !Shift && E; } template @@ -1033,8 +1038,10 @@ public: bool isNeonVectorRegLo() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && - AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( - Reg.RegNum); + (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum) || + AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains( + Reg.RegNum)); } template bool isSVEVectorReg() const { @@ -1606,7 +1613,7 @@ public: void addLogicalImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast(getImm()); - typename std::make_unsigned::type Val = MCE->getValue(); + std::make_unsigned_t Val = MCE->getValue(); uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8); Inst.addOperand(MCOperand::createImm(encoding)); } @@ -1615,7 +1622,7 @@ public: void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast(getImm()); - typename std::make_unsigned::type Val = ~MCE->getValue(); + std::make_unsigned_t Val = ~MCE->getValue(); uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8); Inst.addOperand(MCOperand::createImm(encoding)); } @@ -1771,9 +1778,13 @@ public: void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff)); + const MCConstantExpr *CE = dyn_cast(getImm()); + if (CE) { + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff)); + } else { + addExpr(Inst, getImm()); + } } template @@ -2243,10 +2254,16 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) { bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success; +} + +OperandMatchResultTy 
AArch64AsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { StartLoc = getLoc(); auto Res = tryParseScalarRegister(RegNo); EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return Res != MatchOperand_Success; + return Res; } // Matches a register name or register alias previously defined by '.req' @@ -2404,9 +2421,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreatePrefetch( *PRFM, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2427,9 +2444,9 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreatePSBHint( PSB->Encoding, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2450,9 +2467,9 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier token. Operands.push_back(AArch64Operand::CreateBTIHint( BTI->Encoding, Tok.getString(), S, getContext())); + Parser.Lex(); // Eat identifier token. return MatchOperand_Success; } @@ -2827,6 +2844,7 @@ static const struct Extension { {"tlb-rmi", {AArch64::FeatureTLB_RMI}}, {"pan-rwv", {AArch64::FeaturePAN_RWV}}, {"ccpp", {AArch64::FeatureCCPP}}, + {"rcpc", {AArch64::FeatureRCPC}}, {"sve", {AArch64::FeatureSVE}}, {"sve2", {AArch64::FeatureSVE2}}, {"sve2-aes", {AArch64::FeatureSVE2AES}}, @@ -2851,6 +2869,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { Str += "ARMv8.4a"; else if (FBS[AArch64::HasV8_5aOps]) Str += "ARMv8.5a"; + else if (FBS[AArch64::HasV8_6aOps]) + Str += "ARMv8.6a"; else { auto ext = std::find_if(std::begin(ExtensionMap), std::end(ExtensionMap), @@ -3771,7 +3791,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, // First check for the AArch64-specific .req directive. if (Parser.getTok().is(AsmToken::Identifier) && - Parser.getTok().getIdentifier() == ".req") { + Parser.getTok().getIdentifier().lower() == ".req") { parseDirectiveReq(Name, NameLoc); // We always return 'error' for this, as we're done with this // statement and don't need to match the 'instruction." 
@@ -4106,6 +4126,16 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, "unpredictable STXP instruction, status is also a source"); break; } + case AArch64::LDRABwriteback: + case AArch64::LDRAAwriteback: { + unsigned Xt = Inst.getOperand(0).getReg(); + unsigned Xn = Inst.getOperand(1).getReg(); + if (Xt == Xn) + return Error(Loc[0], + "unpredictable LDRA instruction, writeback base" + " is also a destination"); + break; + } } @@ -4235,6 +4265,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "index must be a multiple of 4 in range [-32, 28]."); case Match_InvalidMemoryIndexed16SImm4: return Error(Loc, "index must be a multiple of 16 in range [-128, 112]."); + case Match_InvalidMemoryIndexed32SImm4: + return Error(Loc, "index must be a multiple of 32 in range [-256, 224]."); case Match_InvalidMemoryIndexed1SImm6: return Error(Loc, "index must be an integer in range [-32, 31]."); case Match_InvalidMemoryIndexedSImm8: @@ -4824,7 +4856,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, getSTI()); + Out.emitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -4894,6 +4926,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed4SImm4: case Match_InvalidMemoryIndexed1SImm6: case Match_InvalidMemoryIndexed16SImm4: + case Match_InvalidMemoryIndexed32SImm4: case Match_InvalidMemoryIndexed4SImm7: case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: @@ -5024,7 +5057,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { getContext().getObjectFileInfo()->getObjectFileType(); bool IsMachO = Format == MCObjectFileInfo::IsMachO; - StringRef IDVal = DirectiveID.getIdentifier(); + auto IDVal = DirectiveID.getIdentifier().lower(); SMLoc Loc = DirectiveID.getLoc(); if (IDVal == ".arch") parseDirectiveArch(Loc); @@ -5076,6 +5109,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, break; case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: + case AArch64::ArchKind::ARMV8_6A: RequestedExtensions.push_back("sm4"); RequestedExtensions.push_back("sha3"); RequestedExtensions.push_back("sha2"); @@ -5095,6 +5129,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, break; case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: + case AArch64::ArchKind::ARMV8_6A: RequestedExtensions.push_back("nosm4"); RequestedExtensions.push_back("nosha3"); RequestedExtensions.push_back("nosha2"); @@ -5314,7 +5349,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, getSTI()); + getParser().getStreamer().emitInstruction(Inst, getSTI()); return false; } @@ -5365,7 +5400,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { "unexpected token in '" + Twine(IDVal) + "' directive")) return true; - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); + getStreamer().emitLOHDirective((MCLOHType)Kind, Args); return false; } @@ -5458,7 +5493,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { bool AArch64AsmParser::parseDirectiveCFINegateRAState() { if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) return true; - getStreamer().EmitCFINegateRAState(); + getStreamer().emitCFINegateRAState(); return false; } @@ -5468,7 +5503,7 @@ 
bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() { if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.cfi_b_key_frame'")) return true; - getStreamer().EmitCFIBKeyFrame(); + getStreamer().emitCFIBKeyFrame(); return false; } diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index d6db88603429f..1ff4abb340540 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -146,6 +146,9 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); @@ -1501,6 +1504,39 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, return Success; } +static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 | + fieldFromInstruction(insn, 12, 9); + unsigned writeback = fieldFromInstruction(insn, 11, 1); + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::LDRAAwriteback: + case AArch64::LDRABwriteback: + DecodeGPR64spRegisterClass(Inst, Rn /* writeback register */, Addr, + Decoder); + break; + case AArch64::LDRAAindexed: + case AArch64::LDRABindexed: + break; + } + + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSImm<10>(Inst, offset, Addr, Decoder); + + if (writeback && Rt == Rn && Rn != 31) { + return SoftFail; + } + + return Success; +} + static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp new file mode 100644 index 0000000000000..11a8d5def4296 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -0,0 +1,1049 @@ +//===--- AArch64CallLowering.cpp - Call lowering --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. 
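DecodeAuthLoadInstruction above assembles the 10-bit signed index of LDRAA/LDRAB from bit 22 plus bits [20:12] and then sign-extends it via DecodeSImm<10>. A standalone sketch of that bit surgery (hypothetical helper names, not the decoder's API):

#include <cstdint>

// Extract `width` bits of `insn` starting at bit `lowBit` (LSB numbering),
// in the spirit of fieldFromInstruction.
static uint32_t field(uint32_t insn, unsigned lowBit, unsigned width) {
  return (insn >> lowBit) & ((1u << width) - 1);
}

// LDRAA/LDRAB index: bit 22 supplies the top (sign) bit, bits [20:12] the low
// nine bits; the 10-bit value is then sign-extended.
static int64_t decodePACLoadOffset(uint32_t insn) {
  uint32_t raw = (field(insn, 22, 1) << 9) | field(insn, 12, 9);
  return static_cast<int64_t>(raw ^ 0x200u) - 0x200;   // sign-extend 10 bits
}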
+/// +//===----------------------------------------------------------------------===// + +#include "AArch64CallLowering.h" +#include "AArch64ISelLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/MachineValueType.h" +#include +#include +#include +#include + +#define DEBUG_TYPE "aarch64-call-lowering" + +using namespace llvm; + +AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) + : CallLowering(&TLI) {} + +namespace { +struct IncomingArgHandler : public CallLowering::ValueHandler { + IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI); + StackUsed = std::max(StackUsed, Size + Offset); + return AddrReg.getReg(0); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + markPhysRegUsed(PhysReg); + switch (VA.getLocInfo()) { + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + } + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + inferAlignFromPtrInfo(MF, MPO)); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). 
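The IncomingArgHandler above shares all of the copy/load logic and defers only the physreg bookkeeping to a virtual hook, while also tracking how deep into the incoming stack area the arguments reach. A toy sketch of that shape (illustrative types, not the GlobalISel classes):

#include <algorithm>
#include <cstdint>

// Base class does the shared work; only the bookkeeping differs between
// formal arguments (block live-in) and call returns (implicit-def on the BL).
struct IncomingHandlerSketch {
  uint64_t StackUsed = 0;

  void handleStackPiece(uint64_t Size, uint64_t Offset) {
    StackUsed = std::max(StackUsed, Size + Offset);   // high-water mark
  }

  void handleRegPiece(unsigned PhysReg) { markPhysRegUsed(PhysReg); }

  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
  virtual ~IncomingHandlerSketch() = default;
};

struct FormalArgSketch : IncomingHandlerSketch {
  void markPhysRegUsed(unsigned) override { /* record a basic-block live-in */ }
};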
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + bool isIncomingArgumentHandler() const override { return true; } + + uint64_t StackUsed; +}; + +struct FormalArgHandler : public IncomingArgHandler { + FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); + MIRBuilder.getMBB().addLiveIn(PhysReg); + } +}; + +struct CallReturnHandler : public IncomingArgHandler { + CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIB.addDef(PhysReg, RegState::Implicit); + } + + MachineInstrBuilder MIB; +}; + +struct OutgoingArgHandler : public CallLowering::ValueHandler { + OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn, + CCAssignFn *AssignFnVarArg, bool IsTailCall = false, + int FPDiff = 0) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), + AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), + StackSize(0), SPReg(0) {} + + bool isIncomingArgumentHandler() const override { return false; } + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + MachineFunction &MF = MIRBuilder.getMF(); + LLT p0 = LLT::pointer(0, 64); + LLT s64 = LLT::scalar(64); + + if (IsTailCall) { + Offset += FPDiff; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + auto FIReg = MIRBuilder.buildFrameIndex(p0, FI); + MPO = MachinePointerInfo::getFixedStack(MF, FI); + return FIReg.getReg(0); + } + + if (!SPReg) + SPReg = MIRBuilder.buildCopy(p0, Register(AArch64::SP)).getReg(0); + + auto OffsetReg = MIRBuilder.buildConstant(s64, Offset); + + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); + + MPO = MachinePointerInfo::getStack(MF, Offset); + return AddrReg.getReg(0); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + MIB.addUse(PhysReg, RegState::Implicit); + Register ExtReg = extendRegister(ValVReg, VA); + MIRBuilder.buildCopy(PhysReg, ExtReg); + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, + inferAlignFromPtrInfo(MF, MPO)); + MIRBuilder.buildStore(ValVReg, Addr, *MMO); + } + + void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr, + uint64_t Size, MachinePointerInfo &MPO, + CCValAssign &VA) override { + unsigned MaxSize = Size * 8; + // For varargs, we always want to extend them to 8 bytes, in which case + // we disable setting a max. + if (!Arg.IsFixed) + MaxSize = 0; + + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt + ? extendRegister(Arg.Regs[0], VA, MaxSize) + : Arg.Regs[0]; + + // If we extended we might need to adjust the MMO's Size. 
+ const LLT RegTy = MRI.getType(ValVReg); + if (RegTy.getSizeInBytes() > Size) + Size = RegTy.getSizeInBytes(); + + assignValueToAddress(ValVReg, Addr, Size, MPO, VA); + } + + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, + CCState &State) override { + bool Res; + if (Info.IsFixed) + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + else + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); + + StackSize = State.getNextStackOffset(); + return Res; + } + + MachineInstrBuilder MIB; + CCAssignFn *AssignFnVarArg; + bool IsTailCall; + + /// For tail calls, the byte offset of the call's argument area from the + /// callee's. Unused elsewhere. + int FPDiff; + uint64_t StackSize; + + // Cache the SP register vreg if we need it more than once in this call site. + Register SPReg; +}; +} // namespace + +static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { + return CallConv == CallingConv::Fast && TailCallOpt; +} + +void AArch64CallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const { + const AArch64TargetLowering &TLI = *getTLI(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + SmallVector SplitVTs; + SmallVector Offsets; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + + if (SplitVTs.size() == 0) + return; + + if (SplitVTs.size() == 1) { + // No splitting to do, but we want to replace the original type (e.g. [1 x + // double] -> double). + SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), + OrigArg.Flags[0], OrigArg.IsFixed); + return; + } + + // Create one ArgInfo for each virtual register in the original ArgInfo. 
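splitToValueTypes above turns one IR-level argument into one ArgInfo per legal part and tags register-block (HFA/HVA-style) arguments so the parts stay in consecutive registers. A toy model of that loop (illustrative structs, not the CallLowering types):

#include <string>
#include <vector>

// One original argument may expand to several parts (e.g. [2 x i64] -> two
// i64 pieces); each part inherits the original flags.
struct PartFlags { bool InConsecutiveRegs = false, InConsecutiveRegsLast = false; };
struct Part { std::string Ty; PartFlags Flags; };

std::vector<Part> splitToParts(const std::vector<std::string> &SplitTys,
                               bool NeedsRegBlock) {
  std::vector<Part> Parts;
  for (const std::string &Ty : SplitTys) {
    Part P{Ty, {}};
    // Register-block arguments must land in consecutive registers, so every
    // part carries the flag...
    P.Flags.InConsecutiveRegs = NeedsRegBlock;
    Parts.push_back(P);
  }
  // ...and the final part is marked as the end of the block.
  if (!Parts.empty())
    Parts.back().Flags.InConsecutiveRegsLast = true;
  return Parts;
}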
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); + + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( + OrigArg.Ty, CallConv, false); + for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { + Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); + SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], + OrigArg.IsFixed); + if (NeedsRegBlock) + SplitArgs.back().Flags[0].setInConsecutiveRegs(); + } + + SplitArgs.back().Flags[0].setInConsecutiveRegsLast(); +} + +bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, + ArrayRef VRegs, + Register SwiftErrorVReg) const { + auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR); + assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && + "Return value without a vreg"); + + bool Success = true; + if (!VRegs.empty()) { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64TargetLowering &TLI = *getTLI(); + CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); + auto &DL = F.getParent()->getDataLayout(); + LLVMContext &Ctx = Val->getType()->getContext(); + + SmallVector SplitEVTs; + ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); + assert(VRegs.size() == SplitEVTs.size() && + "For each split Type there should be exactly one VReg."); + + SmallVector SplitArgs; + CallingConv::ID CC = F.getCallingConv(); + + for (unsigned i = 0; i < SplitEVTs.size(); ++i) { + if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) { + LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need split"); + return false; + } + + Register CurVReg = VRegs[i]; + ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)}; + setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); + + // i1 is a special case because SDAG i1 true is naturally zero extended + // when widened using ANYEXT. We need to do it explicitly here. + if (MRI.getType(CurVReg).getSizeInBits() == 1) { + CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0); + } else { + // Some types will need extending as specified by the CC. + MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]); + if (EVT(NewVT) != SplitEVTs[i]) { + unsigned ExtendOp = TargetOpcode::G_ANYEXT; + if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, + Attribute::SExt)) + ExtendOp = TargetOpcode::G_SEXT; + else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, + Attribute::ZExt)) + ExtendOp = TargetOpcode::G_ZEXT; + + LLT NewLLT(NewVT); + LLT OldLLT(MVT::getVT(CurArgInfo.Ty)); + CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx); + // Instead of an extend, we might have a vector type which needs + // padding with more elements, e.g. <2 x half> -> <4 x half>. + if (NewVT.isVector()) { + if (OldLLT.isVector()) { + if (NewLLT.getNumElements() > OldLLT.getNumElements()) { + // We don't handle VA types which are not exactly twice the + // size, but can easily be done in future. + if (NewLLT.getNumElements() != OldLLT.getNumElements() * 2) { + LLVM_DEBUG(dbgs() << "Outgoing vector ret has too many elts"); + return false; + } + auto Undef = MIRBuilder.buildUndef({OldLLT}); + CurVReg = + MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef}).getReg(0); + } else { + // Just do a vector extend. + CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}) + .getReg(0); + } + } else if (NewLLT.getNumElements() == 2) { + // We need to pad a <1 x S> type to <2 x S>. 
Since we don't have + // <1 x S> vector types in GISel we use a build_vector instead + // of a vector merge/concat. + auto Undef = MIRBuilder.buildUndef({OldLLT}); + CurVReg = + MIRBuilder + .buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)}) + .getReg(0); + } else { + LLVM_DEBUG(dbgs() << "Could not handle ret ty"); + return false; + } + } else { + // A scalar extend. + CurVReg = + MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0); + } + } + } + if (CurVReg != CurArgInfo.Regs[0]) { + CurArgInfo.Regs[0] = CurVReg; + // Reset the arg flags after modifying CurVReg. + setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); + } + splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC); + } + + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn); + Success = handleAssignments(MIRBuilder, SplitArgs, Handler); + } + + if (SwiftErrorVReg) { + MIB.addUse(AArch64::X21, RegState::Implicit); + MIRBuilder.buildCopy(AArch64::X21, SwiftErrorVReg); + } + + MIRBuilder.insertInstr(MIB); + return Success; +} + +/// Helper function to compute forwarded registers for musttail calls. Computes +/// the forwarded registers, sets MBB liveness, and emits COPY instructions that +/// can be used to save + restore registers later. +static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, + CCAssignFn *AssignFn) { + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + MachineFunction &MF = MIRBuilder.getMF(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (!MFI.hasMustTailInVarArgFunc()) + return; + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + const Function &F = MF.getFunction(); + assert(F.isVarArg() && "Expected F to be vararg?"); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs, + F.getContext()); + SmallVector RegParmTypes; + RegParmTypes.push_back(MVT::i64); + RegParmTypes.push_back(MVT::f128); + + // Later on, we can use this vector to restore the registers if necessary. + SmallVectorImpl &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // Conservatively forward X8, since it might be used for an aggregate + // return. + if (!CCInfo.isAllocated(AArch64::X8)) { + unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); + } + + // Add the forwards to the MachineBasicBlock and MachineFunction. 
+ for (const auto &F : Forwards) { + MBB.addLiveIn(F.PReg); + MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg)); + } +} + +bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const { + if (isa(F.getReturnType())) + return true; + return llvm::any_of(F.args(), [](const Argument &A) { + return isa(A.getType()); + }); +} + +bool AArch64CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + + SmallVector SplitArgs; + unsigned i = 0; + for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()).isZero()) + continue; + + ArgInfo OrigArg{VRegs[i], Arg.getType()}; + setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F); + + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv()); + ++i; + } + + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + + const AArch64TargetLowering &TLI = *getTLI(); + CCAssignFn *AssignFn = + TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + + FormalArgHandler Handler(MIRBuilder, MRI, AssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + uint64_t StackOffset = Handler.StackUsed; + if (F.isVarArg()) { + auto &Subtarget = MF.getSubtarget(); + if (!Subtarget.isTargetDarwin()) { + // FIXME: we need to reimplement saveVarArgsRegisters from + // AArch64ISelLowering. + return false; + } + + // We currently pass all varargs at 8-byte alignment, or 4 in ILP32. + StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8); + + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); + } + + if (doesCalleeRestoreStack(F.getCallingConv(), + MF.getTarget().Options.GuaranteedTailCallOpt)) { + // We have a non-standard ABI, so why not make full use of the stack that + // we're going to pop? It must be aligned to 16 B in any case. + StackOffset = alignTo(StackOffset, 16); + + // If we're expected to restore the stack (e.g. fastcc), then we'll be + // adding a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackOffset); + + // Our own callers will guarantee that the space is free by giving an + // aligned value to CALLSEQ_START. + } + + // When we tail call, we need to check if the callee's arguments + // will fit on the caller's stack. So, whenever we lower formal arguments, + // we should keep track of this information, since we might lower a tail call + // in this function later. + FuncInfo->setBytesInStackArgArea(StackOffset); + + auto &Subtarget = MF.getSubtarget(); + if (Subtarget.hasCustomCallingConv()) + Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); + + handleMustTailForwardedRegisters(MIRBuilder, AssignFn); + + // Move back to the end of the basic block. + MIRBuilder.setMBB(MBB); + + return true; +} + +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. 
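Both roundings in lowerFormalArguments above come down to alignTo: varargs start at the next slot boundary (8 bytes, or 4 under ILP32), and any callee-popped argument area is padded to the 16-byte stack alignment. A standalone sketch of the arithmetic for these power-of-two cases:

#include <cstdint>

// Round `Value` up to a power-of-two `Align`; same arithmetic as
// llvm::alignTo for the cases used above.
constexpr uint64_t alignUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

static_assert(alignUp(20, 8) == 24, "vararg area starts at the next 8-byte slot");
static_assert(alignUp(24, 16) == 32, "callee-restored area is 16-byte aligned");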
+static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::PreserveMost: + case CallingConv::Swift: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for +/// CC. +static std::pair +getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) { + return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)}; +} + +bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &InArgs) const { + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + + // If the calling conventions match, then everything must be the same. + if (CalleeCC == CallerCC) + return true; + + // Check if the caller and callee will handle arguments in the same way. + const AArch64TargetLowering &TLI = *getTLI(); + CCAssignFn *CalleeAssignFnFixed; + CCAssignFn *CalleeAssignFnVarArg; + std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) = + getAssignFnsForCC(CalleeCC, TLI); + + CCAssignFn *CallerAssignFnFixed; + CCAssignFn *CallerAssignFnVarArg; + std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) = + getAssignFnsForCC(CallerCC, TLI); + + if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed, + *CalleeAssignFnVarArg, *CallerAssignFnFixed, + *CallerAssignFnVarArg)) + return false; + + // Make sure that the caller and callee preserve all of the same registers. + auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (MF.getSubtarget().hasCustomCallingConv()) { + TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); + TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); + } + + return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved); +} + +bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &OutArgs) const { + // If there are no outgoing arguments, then we are done. + if (OutArgs.empty()) + return true; + + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + const AArch64TargetLowering &TLI = *getTLI(); + + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + // We have outgoing arguments. Make sure that we can tail call with them. + SmallVector OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext()); + + if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) { + LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n"); + return false; + } + + // Make sure that they can fit on the caller's stack. + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); + if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { + LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); + return false; + } + + // Verify that the parameters in callee-saved registers match. + // TODO: Port this over to CallLowering as general code once swiftself is + // supported. 
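The stack check above is the heart of the sibcall test: a tail call reuses the caller's incoming-argument area, so the callee's stack-passed bytes must fit inside it. Reduced to a standalone predicate (illustrative names):

#include <cstdint>

// A sibling call reuses the caller's incoming argument area, so the bytes the
// callee needs on the stack must not exceed the bytes the caller's own frame
// already reserves for incoming arguments.
bool outgoingArgsFitForTailCall(uint64_t CalleeStackArgBytes,
                                uint64_t CallerStackArgAreaBytes) {
  return CalleeStackArgBytes <= CallerStackArgAreaBytes;
}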
+ auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (unsigned i = 0; i < OutLocs.size(); ++i) { + auto &ArgLoc = OutLocs[i]; + // If it's not a register, it's fine. + if (!ArgLoc.isRegLoc()) { + if (Info.IsVarArg) { + // Be conservative and disallow variadic memory operands to match SDAG's + // behaviour. + // FIXME: If the caller's calling convention is C, then we can + // potentially use its argument area. However, for cases like fastcc, + // we can't do anything. + LLVM_DEBUG( + dbgs() + << "... Cannot tail call vararg function with stack arguments\n"); + return false; + } + continue; + } + + Register Reg = ArgLoc.getLocReg(); + + // Only look at callee-saved registers. + if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg)) + continue; + + LLVM_DEBUG( + dbgs() + << "... Call has an argument passed in a callee-saved register.\n"); + + // Check if it was copied from. + ArgInfo &OutInfo = OutArgs[i]; + + if (OutInfo.Regs.size() > 1) { + LLVM_DEBUG( + dbgs() << "... Cannot handle arguments in multiple registers.\n"); + return false; + } + + // Check if we copy the register, walking through copies from virtual + // registers. Note that getDefIgnoringCopies does not ignore copies from + // physical registers. + MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI); + if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) { + LLVM_DEBUG( + dbgs() + << "... Parameter was not copied into a VReg, cannot tail call.\n"); + return false; + } + + // Got a copy. Verify that it's the same as the register we want. + Register CopyRHS = RegDef->getOperand(1).getReg(); + if (CopyRHS != Reg) { + LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into " + "VReg, cannot tail call.\n"); + return false; + } + } + + return true; +} + +bool AArch64CallLowering::isEligibleForTailCallOptimization( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &InArgs, + SmallVectorImpl &OutArgs) const { + + // Must pass all target-independent checks in order to tail call optimize. + if (!Info.IsTailCall) + return false; + + CallingConv::ID CalleeCC = Info.CallConv; + MachineFunction &MF = MIRBuilder.getMF(); + const Function &CallerF = MF.getFunction(); + + LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n"); + + if (Info.SwiftErrorVReg) { + // TODO: We should handle this. + // Note that this is also handled by the check for no outgoing arguments. + // Proactively disabling this though, because the swifterror handling in + // lowerCall inserts a COPY *after* the location of the call. + LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n"); + return false; + } + + if (!mayTailCallThisCC(CalleeCC)) { + LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n"); + return false; + } + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86). + // + // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try + // it? + // + // On Windows, "inreg" attributes signify non-aggregate indirect returns. + // In this case, it is necessary to save/restore X0 in the callee. Tail + // call opt interferes with this. So we disable tail call opt when the + // caller has an argument with "inreg" attribute. + // + // FIXME: Check whether the callee also has an "inreg" argument. 
+ // + // When the caller has a swifterror argument, we don't want to tail call + // because would have to move into the swifterror register before the + // tail call. + if (any_of(CallerF.args(), [](const Argument &A) { + return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr(); + })) { + LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, " + "inreg, or swifterror arguments\n"); + return false; + } + + // Externally-defined functions with weak linkage should not be + // tail-called on AArch64 when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (Info.Callee.isGlobal()) { + const GlobalValue *GV = Info.Callee.getGlobal(); + const Triple &TT = MF.getTarget().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || + TT.isOSBinFormatMachO())) { + LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function " + "with weak linkage for this OS.\n"); + return false; + } + } + + // If we have -tailcallopt, then we're done. + if (MF.getTarget().Options.GuaranteedTailCallOpt) + return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); + + // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall). + // Try to find cases where we can do that. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. + assert((!Info.IsVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + // Verify that the incoming and outgoing arguments from the callee are + // safe to tail call. + if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) { + LLVM_DEBUG( + dbgs() + << "... Caller and callee have incompatible calling conventions.\n"); + return false; + } + + if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) + return false; + + LLVM_DEBUG( + dbgs() << "... Call is eligible for tail call optimization.\n"); + return true; +} + +static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, + bool IsTailCall) { + if (!IsTailCall) + return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; + + if (!IsIndirect) + return AArch64::TCRETURNdi; + + // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use + // x16 or x17. + if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement")) + return AArch64::TCRETURNriBTI; + + return AArch64::TCRETURNri; +} + +bool AArch64CallLowering::lowerTailCall( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &OutArgs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64TargetLowering &TLI = *getTLI(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + // True when we're tail calling, but without -tailcallopt. + bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; + + // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 + // register class. Until we can do that, we should fall back here. 
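// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// getCallOpcode() above chooses between four shapes of call. A compressed,
// text-only mirror of that decision (the mnemonics are the AArch64 opcodes
// the code refers to; this adds nothing beyond what the function already does):

#include <string_view>

constexpr std::string_view callOpcodeSketch(bool IsTailCall, bool IsIndirect,
                                            bool HasBTI) {
  if (!IsTailCall)
    return IsIndirect ? "BLR (via getBLRCallOpcode)" : "BL";
  if (!IsIndirect)
    return "TCRETURNdi";
  // Under branch-target enforcement the target register is limited to x16/x17.
  return HasBTI ? "TCRETURNriBTI" : "TCRETURNri";
}

static_assert(callOpcodeSketch(true, true, true) == "TCRETURNriBTI",
              "indirect tail call under BTI goes through x16/x17");
static_assert(callOpcodeSketch(false, false, false) == "BL", "plain direct call");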
+ if (F.hasFnAttribute("branch-target-enforcement")) { + LLVM_DEBUG( + dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n"); + return false; + } + + // Find out which ABI gets to decide where things go. + CallingConv::ID CalleeCC = Info.CallConv; + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + MachineInstrBuilder CallSeqStart; + if (!IsSibCall) + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); + + // Byte offset for the tail call. When we are sibcalling, this will always + // be 0. + MIB.addImm(0); + + // Tell the call which registers are clobbered. + auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); + if (MF.getSubtarget().hasCustomCallingConv()) + TRI->UpdateCustomCallPreservedMask(MF, &Mask); + MIB.addRegMask(Mask); + + if (TRI->isAnyArgRegReserved(MF)) + TRI->emitReservedArgRegCallError(MF); + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. + int FPDiff = 0; + + // This will be 0 for sibcalls, potentially nonzero for tail calls produced + // by -tailcallopt. For sibcalls, the memory operands for the call are + // already available in the caller's incoming argument space. + unsigned NumBytes = 0; + if (!IsSibCall) { + // We aren't sibcalling, so we need to compute FPDiff. We need to do this + // before handling assignments, because FPDiff must be known for memory + // arguments. + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + SmallVector OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext()); + analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg); + + // The callee will pop the argument stack as a tail call. Thus, we must + // keep it 16-byte aligned. + NumBytes = alignTo(OutInfo.getNextStackOffset(), 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); + + // Do the actual argument marshalling. + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg, true, FPDiff); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) + return false; + + if (Info.IsVarArg && Info.IsMustTailCall) { + // Now we know what's being passed to the function. Add uses to the call for + // the forwarded registers that we *aren't* passing as parameters. This will + // preserve the copies we build earlier. 
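// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// FPDiff above is "incoming stack-argument bytes we may reuse" minus "bytes
// the callee's stack arguments need, rounded up to 16". A worked example with
// invented sizes:

#include <cstdint>

constexpr int64_t fpDiffSketch(uint64_t NumReusableBytes,
                               uint64_t CalleeStackBytes) {
  // The callee-popped argument area is kept 16-byte aligned, as above.
  uint64_t NumBytes = (CalleeStackBytes + 15) & ~uint64_t(15);
  return int64_t(NumReusableBytes) - int64_t(NumBytes);
}

// Caller received 32 bytes of stack arguments; callee needs 40 (48 after
// alignment), so the tail call needs 16 extra bytes and FPDiff is negative.
static_assert(fpDiffSketch(32, 40) == -16, "");
// Caller received 48 bytes; callee needs only 16, so the stack shrinks.
static_assert(fpDiffSketch(48, 16) == 32, "");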
+ for (const auto &F : Forwards) { + Register ForwardedReg = F.PReg; + // If the register is already passed, or aliases a register which is + // already being passed, then skip it. + if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) { + if (!Use.isReg()) + return false; + return TRI->regsOverlap(Use.getReg(), ForwardedReg); + })) + continue; + + // We aren't passing it already, so we should add it to the call. + MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg)); + MIB.addReg(ForwardedReg, RegState::Implicit); + } + } + + // If we have -tailcallopt, we need to adjust the stack. We'll do the call + // sequence start and end here. + if (!IsSibCall) { + MIB->getOperand(1).setImm(FPDiff); + CallSeqStart.addImm(NumBytes).addImm(0); + // End the call sequence *before* emitting the call. Normally, we would + // tidy the frame up after the call. However, here, we've laid out the + // parameters so that when SP is reset, they will be in the correct + // location. + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0); + } + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific instruction, + // it must have a register class matching the constraint of that instruction. + if (Info.Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); + + MF.getFrameInfo().setHasTailCall(); + Info.LoweredTailCall = true; + return true; +} + +bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + const AArch64TargetLowering &TLI = *getTLI(); + + SmallVector OutArgs; + for (auto &OrigArg : Info.OrigArgs) { + splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv); + // AAPCS requires that we zero-extend i1 to 8 bits by the caller. + if (OrigArg.Ty->isIntegerTy(1)) + OutArgs.back().Flags[0].setZExt(); + } + + SmallVector InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) + splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv()); + + // If we can lower as a tail call, do that instead. + bool CanTailCallOpt = + isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs); + + // We must emit a tail call if we have musttail. + if (Info.IsMustTailCall && !CanTailCallOpt) { + // There are types of incoming/outgoing arguments we can't handle yet, so + // it doesn't make sense to actually die here like in ISelLowering. Instead, + // fall back to SelectionDAG and let it try to handle this. + LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n"); + return false; + } + + if (CanTailCallOpt) + return lowerTailCall(MIRBuilder, Info, OutArgs); + + // Find out which ABI gets to decide where things go. + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = + getAssignFnsForCC(Info.CallConv, TLI); + + MachineInstrBuilder CallSeqStart; + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + + // Create a temporarily-floating call instruction so we can add the implicit + // uses of arg registers. 
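// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// The setZExt() above records that a caller passing an i1 argument must hand
// over a value already zero-extended to at least 8 bits, so the callee can
// read the low byte without re-normalizing it. In plain C++ terms:

#include <cstdint>

constexpr uint8_t passI1(bool Flag) {
  return Flag ? 1 : 0; // bits 1..7 are guaranteed zero; bit 0 carries the value
}

static_assert(passI1(true) == 0x01 && passI1(false) == 0x00, "");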
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); + + // Tell the call which registers are clobbered. + auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv); + if (MF.getSubtarget().hasCustomCallingConv()) + TRI->UpdateCustomCallPreservedMask(MF, &Mask); + MIB.addRegMask(Mask); + + if (TRI->isAnyArgRegReserved(MF)) + TRI->emitReservedArgRegCallError(MF); + + // Do the actual argument marshalling. + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg, false); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) + return false; + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific + // instruction, it must have a register class matching the + // constraint of that instruction. + if (Info.Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); + + // Finally we can copy the returned value back into its virtual-register. In + // symmetry with the arguments, the physical register must be an + // implicit-define of the call instruction. + if (!Info.OrigRet.Ty->isVoidTy()) { + CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); + CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); + if (!handleAssignments(MIRBuilder, InArgs, Handler)) + return false; + } + + if (Info.SwiftErrorVReg) { + MIB.addDef(AArch64::X21, RegState::Implicit); + MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21)); + } + + uint64_t CalleePopBytes = + doesCalleeRestoreStack(Info.CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt) + ? alignTo(Handler.StackSize, 16) + : 0; + + CallSeqStart.addImm(Handler.StackSize).addImm(0); + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) + .addImm(Handler.StackSize) + .addImm(CalleePopBytes); + + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h new file mode 100644 index 0000000000000..640a862530596 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -0,0 +1,84 @@ +//===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/IR/CallingConv.h" +#include +#include + +namespace llvm { + +class AArch64TargetLowering; +class CCValAssign; +class DataLayout; +class MachineIRBuilder; +class MachineRegisterInfo; +class Type; + +class AArch64CallLowering: public CallLowering { +public: + AArch64CallLowering(const AArch64TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + ArrayRef VRegs, + Register SwiftErrorVReg) const override; + + bool fallBackToDAGISel(const Function &F) const override; + + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const override; + + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; + + /// Returns true if the call can be lowered as a tail call. + bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl &InArgs, + SmallVectorImpl &OutArgs) const; + + bool supportSwiftError() const override { return true; } + +private: + using RegHandler = std::function; + + using MemHandler = + std::function; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &OutArgs) const; + + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &OutArgs) const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp new file mode 100644 index 0000000000000..408f0cb77e738 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -0,0 +1,5704 @@ +//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterBankInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "aarch64-isel" + +using namespace llvm; + +namespace { + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +class AArch64InstructionSelector : public InstructionSelector { +public: + AArch64InstructionSelector(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override { + InstructionSelector::setupMF(MF, KB, CoverageInfo); + + // hasFnAttribute() is expensive to call on every BRCOND selection, so + // cache it here for each run of the selector. + ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + MFReturnAddr = Register(); + + processPHIs(MF); + } + +private: + /// tblgen-erated 'select' implementation, used as the initial selector for + /// the patterns that don't require complex C++. + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + // A lowering phase that runs before any selection attempts. + // Returns true if the instruction was modified. + bool preISelLower(MachineInstr &I); + + // An early selection function that runs before the selectImpl() call. + bool earlySelect(MachineInstr &I) const; + + // Do some preprocessing of G_PHIs before we begin selection. + void processPHIs(MachineFunction &MF); + + bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + + /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
+ bool contractCrossBankCopyIntoStore(MachineInstr &I, + MachineRegisterInfo &MRI); + + bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); + + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool tryOptAndIntoCompareBranch(MachineInstr *LHS, + int64_t CmpConstant, + const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; + bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + + // Helper to generate an equivalent of scalar_to_vector into a new register, + // returned via 'Dst'. + MachineInstr *emitScalarToVector(unsigned EltSize, + const TargetRegisterClass *DstRC, + Register Scalar, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a lane insert into \p DstReg, or a new vector register if None is + /// provided. + /// + /// The lane inserted into is defined by \p LaneIdx. The vector source + /// register is given by \p SrcReg. The register containing the element is + /// given by \p EltReg. + MachineInstr *emitLaneInsert(Optional DstReg, Register SrcReg, + Register EltReg, unsigned LaneIdx, + const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const; + bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, + MachineRegisterInfo &MRI) const; + bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + + bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectSplitVectorUnmerge(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsicWithSideEffects(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; + + unsigned emitConstantPoolEntry(const Constant *CPVal, + MachineFunction &MF) const; + MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; + + // Emit a vector concat operation. + MachineInstr *emitVectorConcat(Optional Dst, Register Op1, + Register Op2, + MachineIRBuilder &MIRBuilder) const; + + // Emit an integer compare between LHS and RHS, which checks for Predicate. + // + // This returns the produced compare instruction, and the predicate which + // was ultimately used in the compare. The predicate may differ from what + // is passed in \p Predicate due to optimization. 
+ std::pair + emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitTST(const Register &LHS, const Register &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitExtractVectorElt(Optional DstReg, + const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, + MachineIRBuilder &MIRBuilder) const; + + /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be + /// materialized using a FMOV instruction, then update MI and return it. + /// Otherwise, do nothing and return a nullptr. + MachineInstr *emitFMovForFConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) const; + + /// Emit a CSet for a compare. + MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. + /// \p IsNegative is true if the test should be "not zero". + /// This will also optimize the test bit instruction when possible. + MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, + MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const; + + // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. + // We use these manually instead of using the importer since it doesn't + // support SDNodeXForm. + ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; + + ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; + ComplexRendererFns selectArithImmed(MachineOperand &Root) const; + ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; + + ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const; + + ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 1); + } + ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 2); + } + ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 4); + } + ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 8); + } + ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 16); + } + + /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used + /// from complex pattern matchers like selectAddrModeIndexed(). 
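// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// The selectAddrModeUnscaled* helpers above feed the LDUR/STUR-style forms,
// which take a 9-bit signed byte offset, while the selectAddrModeIndexed
// helpers that follow feed the LDR/STR "ui" forms, whose 12-bit immediate is
// unsigned and scaled by the access size. A standalone model of the scaled
// check, with made-up offsets:

#include <cstdint>

// Returns the encoded immediate field for an access of Size bytes, or -1 if
// the offset needs the unscaled (LDUR) form instead.
constexpr int64_t unsignedScaledField(int64_t Offset, int64_t Size) {
  if (Offset < 0 || Offset % Size != 0)
    return -1;
  int64_t Field = Offset / Size;
  return Field <= 4095 ? Field : -1; // 12-bit unsigned immediate
}

static_assert(unsignedScaledField(24, 8) == 3, "ldr x0, [x1, #24]");
static_assert(unsignedScaledField(-8, 8) == -1,
              "ldur x0, [x1, #-8]: negative offsets take the unscaled form");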
+ ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, + MachineRegisterInfo &MRI) const; + + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const; + template + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { + return selectAddrModeIndexed(Root, Width / 8); + } + + bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + ComplexRendererFns + selectAddrModeShiftedExtendXReg(MachineOperand &Root, + unsigned SizeInBytes) const; + + /// Returns a \p ComplexRendererFns which contains a base, offset, and whether + /// or not a shift + extend should be folded into an addressing mode. Returns + /// None when this is not profitable or possible. + ComplexRendererFns + selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, + MachineOperand &Offset, unsigned SizeInBytes, + bool WantsExt) const; + ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { + return selectAddrModeXRO(Root, Width / 8); + } + + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { + return selectAddrModeWRO(Root, Width / 8); + } + + ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; + + ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { + return selectShiftedRegister(Root); + } + + ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { + // TODO: selectShiftedRegister should allow for rotates on logical shifts. + // For now, make them the same. The only difference between the two is that + // logical shifts are allowed to fold in rotates. Otherwise, these are + // functionally the same. + return selectShiftedRegister(Root); + } + + /// Given an extend instruction, determine the correct shift-extend type for + /// that instruction. + /// + /// If the instruction is going to be used in a load or store, pass + /// \p IsLoadStore = true. + AArch64_AM::ShiftExtendType + getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, + bool IsLoadStore = false) const; + + /// Instructions that accept extend modifiers like UXTW expect the register + /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a + /// subregister copy if necessary. Return either ExtReg, or the result of the + /// new copy. + Register narrowExtendRegIfNeeded(Register ExtReg, + MachineIRBuilder &MIB) const; + Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size, + MachineIRBuilder &MIB) const; + ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; + + void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx = -1) const; + void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; + void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; + + // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. + void materializeLargeCMVal(MachineInstr &I, const Value *V, + unsigned OpFlags) const; + + // Optimization methods. 
+ bool tryOptSelect(MachineInstr &MI) const; + MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS, + MachineOperand &RHS, + CmpInst::Predicate &Predicate, + MachineIRBuilder &MIB) const; + MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIB) const; + + /// Return true if \p MI is a load or store of \p NumBytes bytes. + bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; + + /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit + /// register zeroed out. In other words, the result of MI has been explicitly + /// zero extended. + bool isDef32(const MachineInstr &MI) const; + + const AArch64TargetMachine &TM; + const AArch64Subtarget &STI; + const AArch64InstrInfo &TII; + const AArch64RegisterInfo &TRI; + const AArch64RegisterBankInfo &RBI; + + bool ProduceNonFlagSettingCondBr = false; + + // Some cached values used during selection. + // We use LR as a live-in register, and we keep track of it here as it can be + // clobbered by calls. + Register MFReturnAddr; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +// We declare the temporaries used by selectImpl() in the class to minimize the +// cost of constructing placeholder values. +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +AArch64InstructionSelector::AArch64InstructionSelector( + const AArch64TargetMachine &TM, const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI) + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +// FIXME: This should be target-independent, inferred from the types declared +// for each class in the bank. +static const TargetRegisterClass * +getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, + const RegisterBankInfo &RBI, + bool GetAllRegSet = false) { + if (RB.getID() == AArch64::GPRRegBankID) { + if (Ty.getSizeInBits() <= 32) + return GetAllRegSet ? &AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (Ty.getSizeInBits() == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + return nullptr; + } + + if (RB.getID() == AArch64::FPRRegBankID) { + if (Ty.getSizeInBits() <= 16) + return &AArch64::FPR16RegClass; + if (Ty.getSizeInBits() == 32) + return &AArch64::FPR32RegClass; + if (Ty.getSizeInBits() == 64) + return &AArch64::FPR64RegClass; + if (Ty.getSizeInBits() == 128) + return &AArch64::FPR128RegClass; + return nullptr; + } + + return nullptr; +} + +/// Given a register bank, and size in bits, return the smallest register class +/// that can represent that combination. +static const TargetRegisterClass * +getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, + bool GetAllRegSet = false) { + unsigned RegBankID = RB.getID(); + + if (RegBankID == AArch64::GPRRegBankID) { + if (SizeInBits <= 32) + return GetAllRegSet ? 
&AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (SizeInBits == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + } + + if (RegBankID == AArch64::FPRRegBankID) { + switch (SizeInBits) { + default: + return nullptr; + case 8: + return &AArch64::FPR8RegClass; + case 16: + return &AArch64::FPR16RegClass; + case 32: + return &AArch64::FPR32RegClass; + case 64: + return &AArch64::FPR64RegClass; + case 128: + return &AArch64::FPR128RegClass; + } + } + + return nullptr; +} + +/// Returns the correct subregister to use for a given register class. +static bool getSubRegForClass(const TargetRegisterClass *RC, + const TargetRegisterInfo &TRI, unsigned &SubReg) { + switch (TRI.getRegSizeInBits(*RC)) { + case 8: + SubReg = AArch64::bsub; + break; + case 16: + SubReg = AArch64::hsub; + break; + case 32: + if (RC != &AArch64::FPR32RegClass) + SubReg = AArch64::sub_32; + else + SubReg = AArch64::ssub; + break; + case 64: + SubReg = AArch64::dsub; + break; + default: + LLVM_DEBUG( + dbgs() << "Couldn't find appropriate subregister for register class."); + return false; + } + + return true; +} + +/// Returns the minimum size the given register bank can hold. +static unsigned getMinSizeForRegBank(const RegisterBank &RB) { + switch (RB.getID()) { + case AArch64::GPRRegBankID: + return 32; + case AArch64::FPRRegBankID: + return 8; + default: + llvm_unreachable("Tried to get minimum size for unknown register bank."); + } +} + +static Optional getImmedFromMO(const MachineOperand &Root) { + auto &MI = *Root.getParent(); + auto &MBB = *MI.getParent(); + auto &MF = *MBB.getParent(); + auto &MRI = MF.getRegInfo(); + uint64_t Immed; + if (Root.isImm()) + Immed = Root.getImm(); + else if (Root.isCImm()) + Immed = Root.getCImm()->getZExtValue(); + else if (Root.isReg()) { + auto ValAndVReg = + getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); + if (!ValAndVReg) + return None; + Immed = ValAndVReg->Value; + } else + return None; + return Immed; +} + +/// Check whether \p I is a currently unsupported binary operation: +/// - it has an unsized type +/// - an operand is not a vreg +/// - all operands are not in the same bank +/// These are checks that should someday live in the verifier, but right now, +/// these are mostly limitations of the aarch64 selector. +static bool unsupportedBinOp(const MachineInstr &I, + const AArch64RegisterBankInfo &RBI, + const MachineRegisterInfo &MRI, + const AArch64RegisterInfo &TRI) { + LLT Ty = MRI.getType(I.getOperand(0).getReg()); + if (!Ty.isValid()) { + LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); + return true; + } + + const RegisterBank *PrevOpBank = nullptr; + for (auto &MO : I.operands()) { + // FIXME: Support non-register operands. + if (!MO.isReg()) { + LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); + return true; + } + + // FIXME: Can generic operations have physical registers operands? If + // so, this will need to be taught about that, and we'll need to get the + // bank out of the minimal class for the register. + // Either way, this needs to be documented (and possibly verified). 
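// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// The lookup helpers above boil down to a small table: a size on the GPR bank
// picks GPR32/GPR64 (nothing narrower than 32 bits exists there), a size on
// the FPR bank picks FPR8..FPR128, and getSubRegForClass() names the
// subregister used when narrowing (bsub/hsub/ssub/dsub, or sub_32 on the GPR
// side). A text-only mirror of the class choice:

#include <string_view>

constexpr std::string_view minClassSketch(bool IsFPRBank, unsigned SizeInBits) {
  if (!IsFPRBank) {
    if (SizeInBits <= 32)
      return "GPR32";
    return SizeInBits == 64 ? "GPR64" : "<none>";
  }
  switch (SizeInBits) {
  case 8:   return "FPR8";
  case 16:  return "FPR16";
  case 32:  return "FPR32";
  case 64:  return "FPR64";
  case 128: return "FPR128";
  default:  return "<none>";
  }
}

static_assert(minClassSketch(false, 1) == "GPR32",
              "even an s1 on the GPR bank lives in a 32-bit class");
static_assert(minClassSketch(true, 16) == "FPR16", "");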
+ if (!Register::isVirtualRegister(MO.getReg())) { + LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); + return true; + } + + const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); + if (!OpBank) { + LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); + return true; + } + + if (PrevOpBank && OpBank != PrevOpBank) { + LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); + return true; + } + PrevOpBank = OpBank; + } + return false; +} + +/// Select the AArch64 opcode for the basic binary operation \p GenericOpc +/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID +/// and of size \p OpSize. +/// \returns \p GenericOpc if the combination is unsupported. +static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, + unsigned OpSize) { + switch (RegBankID) { + case AArch64::GPRRegBankID: + if (OpSize == 32) { + switch (GenericOpc) { + case TargetOpcode::G_SHL: + return AArch64::LSLVWr; + case TargetOpcode::G_LSHR: + return AArch64::LSRVWr; + case TargetOpcode::G_ASHR: + return AArch64::ASRVWr; + default: + return GenericOpc; + } + } else if (OpSize == 64) { + switch (GenericOpc) { + case TargetOpcode::G_PTR_ADD: + return AArch64::ADDXrr; + case TargetOpcode::G_SHL: + return AArch64::LSLVXr; + case TargetOpcode::G_LSHR: + return AArch64::LSRVXr; + case TargetOpcode::G_ASHR: + return AArch64::ASRVXr; + default: + return GenericOpc; + } + } + break; + case AArch64::FPRRegBankID: + switch (OpSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_FADD: + return AArch64::FADDSrr; + case TargetOpcode::G_FSUB: + return AArch64::FSUBSrr; + case TargetOpcode::G_FMUL: + return AArch64::FMULSrr; + case TargetOpcode::G_FDIV: + return AArch64::FDIVSrr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_FADD: + return AArch64::FADDDrr; + case TargetOpcode::G_FSUB: + return AArch64::FSUBDrr; + case TargetOpcode::G_FMUL: + return AArch64::FMULDrr; + case TargetOpcode::G_FDIV: + return AArch64::FDIVDrr; + case TargetOpcode::G_OR: + return AArch64::ORRv8i8; + default: + return GenericOpc; + } + } + break; + } + return GenericOpc; +} + +/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, +/// appropriate for the (value) register bank \p RegBankID and of memory access +/// size \p OpSize. This returns the variant with the base+unsigned-immediate +/// addressing mode (e.g., LDRXui). +/// \returns \p GenericOpc if the combination is unsupported. +static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, + unsigned OpSize) { + const bool isStore = GenericOpc == TargetOpcode::G_STORE; + switch (RegBankID) { + case AArch64::GPRRegBankID: + switch (OpSize) { + case 8: + return isStore ? AArch64::STRBBui : AArch64::LDRBBui; + case 16: + return isStore ? AArch64::STRHHui : AArch64::LDRHHui; + case 32: + return isStore ? AArch64::STRWui : AArch64::LDRWui; + case 64: + return isStore ? AArch64::STRXui : AArch64::LDRXui; + } + break; + case AArch64::FPRRegBankID: + switch (OpSize) { + case 8: + return isStore ? AArch64::STRBui : AArch64::LDRBui; + case 16: + return isStore ? AArch64::STRHui : AArch64::LDRHui; + case 32: + return isStore ? AArch64::STRSui : AArch64::LDRSui; + case 64: + return isStore ? AArch64::STRDui : AArch64::LDRDui; + } + break; + } + return GenericOpc; +} + +#ifndef NDEBUG +/// Helper function that verifies that we have a valid copy at the end of +/// selectCopy. 
Verifies that the source and dest have the expected sizes and +/// then returns true. +static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Make sure the size of the source and dest line up. + assert( + (DstSize == SrcSize || + // Copies are a mean to setup initial types, the number of + // bits may not exactly match. + (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || + // Copies are a mean to copy bits around, as long as we are + // on the same register class, that's fine. Otherwise, that + // means we need some SUBREG_TO_REG or AND & co. + (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && + "Copy with different width?!"); + + // Check the size of the destination. + assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && + "GPRs cannot get more than 64-bit width values"); + + return true; +} +#endif + +/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg +/// to \p *To. +/// +/// E.g "To = COPY SrcReg:SubReg" +static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, Register SrcReg, + const TargetRegisterClass *To, unsigned SubReg) { + assert(SrcReg.isValid() && "Expected a valid source register?"); + assert(To && "Destination register class cannot be null"); + assert(SubReg && "Expected a valid subregister"); + + MachineIRBuilder MIB(I); + auto SubRegCopy = + MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(SubRegCopy.getReg(0)); + + // It's possible that the destination register won't be constrained. Make + // sure that happens. + if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) + RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); + + return true; +} + +/// Helper function to get the source and destination register classes for a +/// copy. Returns a std::pair containing the source register class for the +/// copy, and the destination register class for the copy. If a register class +/// cannot be determined, then it will be nullptr. +static std::pair +getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Special casing for cross-bank copies of s1s. We can technically represent + // a 1-bit value with any size of register. The minimum size for a GPR is 32 + // bits. So, we need to put the FPR on 32 bits as well. + // + // FIXME: I'm not sure if this case holds true outside of copies. If it does, + // then we can pull it into the helpers that get the appropriate class for a + // register bank. Or make a new helper that carries along some constraint + // information. 
+ if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) + SrcSize = DstSize = 32; + + return {getMinClassForRegBank(SrcRegBank, SrcSize, true), + getMinClassForRegBank(DstRegBank, DstSize, true)}; +} + +static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + + // Find the correct register classes for the source and destination registers. + const TargetRegisterClass *SrcRC; + const TargetRegisterClass *DstRC; + std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); + + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Unexpected dest size " + << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); + return false; + } + + // A couple helpers below, for making sure that the copy we produce is valid. + + // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want + // to verify that the src and dst are the same size, since that's handled by + // the SUBREG_TO_REG. + bool KnownValid = false; + + // Returns true, or asserts if something we don't expect happens. Instead of + // returning true, we return isValidCopy() to ensure that we verify the + // result. + auto CheckCopy = [&]() { + // If we have a bitcast or something, we can't have physical registers. + assert((I.isCopy() || + (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && + !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && + "No phys reg on generic operator!"); + bool ValidCopy = true; +#ifndef NDEBUG + ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); + assert(ValidCopy && "Invalid copy."); +#endif + return ValidCopy; + }; + + // Is this a copy? If so, then we may need to insert a subregister copy. + if (I.isCopy()) { + // Yes. Check if there's anything to fix up. + if (!SrcRC) { + LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); + return false; + } + + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + unsigned SubReg; + + // If the source bank doesn't support a subregister copy small enough, + // then we first need to copy to the destination bank. + if (getMinSizeForRegBank(SrcRegBank) > DstSize) { + const TargetRegisterClass *DstTempRC = + getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); + getSubRegForClass(DstRC, TRI, SubReg); + + MachineIRBuilder MIB(I); + auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); + copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); + } else if (SrcSize > DstSize) { + // If the source register is bigger than the destination we need to + // perform a subregister copy. + const TargetRegisterClass *SubRegRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SubRegRC, TRI, SubReg); + copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); + } else if (DstSize > SrcSize) { + // If the destination register is bigger than the source we need to do + // a promotion using SUBREG_TO_REG. 
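// [Editorial aside: illustrative note, not part of the vendored patch.]
// SUBREG_TO_REG, used just below, is the widening counterpart of a plain
// subregister copy: it builds the wider value by placing the source into the
// named subregister, and its leading immediate (0 here) records what the
// remaining bits are taken to hold. Roughly, in MIR terms (register names
// made up for illustration):
//
//   %wide:gpr64 = SUBREG_TO_REG 0, %narrow:gpr32, %subreg.sub_32
//   %dst = COPY %wide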
+ const TargetRegisterClass *PromotionRC = + getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); + getSubRegForClass(SrcRC, TRI, SubReg); + + Register PromoteReg = MRI.createVirtualRegister(PromotionRC); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG), PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + + // Promise that the copy is implicitly validated by the SUBREG_TO_REG. + KnownValid = true; + } + + // If the destination is a physical register, then there's nothing to + // change, so we're done. + if (Register::isPhysicalRegister(DstReg)) + return CheckCopy(); + } + + // No need to constrain SrcReg. It will get constrained when we hit another + // of its use or its defs. Copies do not have constraints. + if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) + << " operand\n"); + return false; + } + I.setDesc(TII.get(AArch64::COPY)); + return CheckCopy(); +} + +static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { + if (!DstTy.isScalar() || !SrcTy.isScalar()) + return GenericOpc; + + const unsigned DstSize = DstTy.getSizeInBits(); + const unsigned SrcSize = SrcTy.getSizeInBits(); + + switch (DstSize) { + case 32: + switch (SrcSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUWSri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUWSri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUWSr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUWSr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUXSri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUXSri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUWDr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUWDr; + default: + return GenericOpc; + } + default: + return GenericOpc; + } + case 64: + switch (SrcSize) { + case 32: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUWDri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUWDri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUXSr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUXSr; + default: + return GenericOpc; + } + case 64: + switch (GenericOpc) { + case TargetOpcode::G_SITOFP: + return AArch64::SCVTFUXDri; + case TargetOpcode::G_UITOFP: + return AArch64::UCVTFUXDri; + case TargetOpcode::G_FPTOSI: + return AArch64::FCVTZSUXDr; + case TargetOpcode::G_FPTOUI: + return AArch64::FCVTZUUXDr; + default: + return GenericOpc; + } + default: + return GenericOpc; + } + default: + return GenericOpc; + }; + return GenericOpc; +} + +static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::GPRRegBankID); + LLT Ty = MRI.getType(I.getOperand(0).getReg()); + if (Ty == LLT::scalar(32)) + return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr; + else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) + return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr; + return 0; +} + +/// Helper function to select the opcode for a G_FCMP. 
+static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) { + // If this is a compare against +0.0, then we don't have to explicitly + // materialize a constant. + const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI); + bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); + unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + if (OpSize != 32 && OpSize != 64) + return 0; + unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, + {AArch64::FCMPSri, AArch64::FCMPDri}}; + return CmpOpcTbl[ShouldUseImm][OpSize == 64]; +} + +/// Returns true if \p P is an unsigned integer comparison predicate. +static bool isUnsignedICMPPred(const CmpInst::Predicate P) { + switch (P) { + default: + return false; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return true; + } +} + +static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { + switch (P) { + default: + llvm_unreachable("Unknown condition code!"); + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_EQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + return AArch64CC::GE; + case CmpInst::ICMP_SLT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + return AArch64CC::LE; + case CmpInst::ICMP_UGT: + return AArch64CC::HI; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + case CmpInst::ICMP_ULE: + return AArch64CC::LS; + } +} + +static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (P) { + default: + llvm_unreachable("Unknown FP condition!"); + case CmpInst::FCMP_OEQ: + CondCode = AArch64CC::EQ; + break; + case CmpInst::FCMP_OGT: + CondCode = AArch64CC::GT; + break; + case CmpInst::FCMP_OGE: + CondCode = AArch64CC::GE; + break; + case CmpInst::FCMP_OLT: + CondCode = AArch64CC::MI; + break; + case CmpInst::FCMP_OLE: + CondCode = AArch64CC::LS; + break; + case CmpInst::FCMP_ONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case CmpInst::FCMP_ORD: + CondCode = AArch64CC::VC; + break; + case CmpInst::FCMP_UNO: + CondCode = AArch64CC::VS; + break; + case CmpInst::FCMP_UEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case CmpInst::FCMP_UGT: + CondCode = AArch64CC::HI; + break; + case CmpInst::FCMP_UGE: + CondCode = AArch64CC::PL; + break; + case CmpInst::FCMP_ULT: + CondCode = AArch64CC::LT; + break; + case CmpInst::FCMP_ULE: + CondCode = AArch64CC::LE; + break; + case CmpInst::FCMP_UNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// Return a register which can be used as a bit to test in a TB(N)Z. +static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, + MachineRegisterInfo &MRI) { + assert(Reg.isValid() && "Expected valid register!"); + while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { + unsigned Opc = MI->getOpcode(); + + if (!MI->getOperand(0).isReg() || + !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + break; + + // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. + // + // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number + // on the truncated x is the same as the bit number on x. 
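// [Editorial aside: illustrative note, not part of the vendored patch.]
// In the FCMP table above, two predicates need a second condition code
// because no single AArch64 condition covers them: ONE (ordered and unequal)
// becomes MI or GT, and UEQ (unordered or equal) becomes EQ or VS. A branch
// on such a predicate can therefore end up as two conditional branches, e.g.
//
//   fcmp s0, s1
//   b.mi .Ltaken        ; "less than" half of ordered-and-not-equal
//   b.gt .Ltaken        ; "greater than" half
//   b    .Lfallthrough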
+ if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || + Opc == TargetOpcode::G_TRUNC) { + Register NextReg = MI->getOperand(1).getReg(); + // Did we find something worth folding? + if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) + break; + + // NextReg is worth folding. Keep looking. + Reg = NextReg; + continue; + } + + // Attempt to find a suitable operation with a constant on one side. + Optional C; + Register TestReg; + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + case TargetOpcode::G_XOR: { + TestReg = MI->getOperand(1).getReg(); + Register ConstantReg = MI->getOperand(2).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!VRegAndVal) { + // AND commutes, check the other side for a constant. + // FIXME: Can we canonicalize the constant so that it's always on the + // same side at some point earlier? + std::swap(ConstantReg, TestReg); + VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + } + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_SHL: { + TestReg = MI->getOperand(1).getReg(); + auto VRegAndVal = + getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); + if (VRegAndVal) + C = VRegAndVal->Value; + break; + } + } + + // Didn't find a constant or viable register. Bail out of the loop. + if (!C || !TestReg.isValid()) + break; + + // We found a suitable instruction with a constant. Check to see if we can + // walk through the instruction. + Register NextReg; + unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); + switch (Opc) { + default: + break; + case TargetOpcode::G_AND: + // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. + if ((*C >> Bit) & 1) + NextReg = TestReg; + break; + case TargetOpcode::G_SHL: + // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in + // the type of the register. + if (*C <= Bit && (Bit - *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit - *C; + } + break; + case TargetOpcode::G_ASHR: + // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits + // in x + NextReg = TestReg; + Bit = Bit + *C; + if (Bit >= TestRegSize) + Bit = TestRegSize - 1; + break; + case TargetOpcode::G_LSHR: + // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x + if ((Bit + *C) < TestRegSize) { + NextReg = TestReg; + Bit = Bit + *C; + } + break; + case TargetOpcode::G_XOR: + // We can walk through a G_XOR by inverting whether we use tbz/tbnz when + // appropriate. + // + // e.g. If x' = xor x, c, and the b-th bit is set in c then + // + // tbz x', b -> tbnz x, b + // + // Because x' only has the b-th bit set if x does not. + if ((*C >> Bit) & 1) + Invert = !Invert; + NextReg = TestReg; + break; + } + + // Check if we found anything worth folding. + if (!NextReg.isValid()) + return Reg; + Reg = NextReg; + } + + return Reg; +} + +MachineInstr *AArch64InstructionSelector::emitTestBit( + Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const { + assert(TestReg.isValid()); + assert(ProduceNonFlagSettingCondBr && + "Cannot emit TB(N)Z with speculation tracking!"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Attempt to optimize the test bit by walking over instructions. 
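// [Editorial aside: illustrative sketch, not part of the vendored patch.]
// The bit-index arithmetic getTestBitReg() applies above can be checked with
// plain integers: looking through a left shift by c lowers the tested index
// by c, looking through a right shift raises it by c, and XOR with a constant
// whose tested bit is set merely flips the TBZ/TBNZ polarity.

#include <cstdint>

constexpr bool bit(uint64_t V, unsigned B) { return (V >> B) & 1; }
constexpr uint64_t X = 0xA5; // arbitrary sample value

static_assert(bit(X << 2, 5) == bit(X, 3), "tbz (shl x, 2), 5  ->  tbz x, 3");
static_assert(bit(X >> 2, 3) == bit(X, 5), "tbz (lshr x, 2), 3 ->  tbz x, 5");
static_assert(bit(X ^ (1ull << 3), 3) == !bit(X, 3),
              "tbz (xor x, 8), 3  ->  tbnz x, 3");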
+ TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); + LLT Ty = MRI.getType(TestReg); + unsigned Size = Ty.getSizeInBits(); + assert(!Ty.isVector() && "Expected a scalar!"); + assert(Bit < 64 && "Bit is too large!"); + + // When the test register is a 64-bit register, we have to narrow to make + // TBNZW work. + bool UseWReg = Bit < 32; + unsigned NecessarySize = UseWReg ? 32 : 64; + if (Size < NecessarySize) + TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB); + else if (Size > NecessarySize) + TestReg = narrowExtendRegIfNeeded(TestReg, MIB); + + static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, + {AArch64::TBZW, AArch64::TBNZW}}; + unsigned Opc = OpcTable[UseWReg][IsNegative]; + auto TestBitMI = + MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); + constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); + return &*TestBitMI; +} + +bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( + MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred, + MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { + // Given something like this: + // + // %x = ...Something... + // %one = G_CONSTANT i64 1 + // %zero = G_CONSTANT i64 0 + // %and = G_AND %x, %one + // %cmp = G_ICMP intpred(ne), %and, %zero + // %cmp_trunc = G_TRUNC %cmp + // G_BRCOND %cmp_trunc, %bb.3 + // + // We want to try and fold the AND into the G_BRCOND and produce either a + // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). + // + // In this case, we'd get + // + // TBNZ %x %bb.3 + // + if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND) + return false; + + // Need to be comparing against 0 to fold. + if (CmpConstant != 0) + return false; + + MachineRegisterInfo &MRI = *MIB.getMRI(); + + // Only support EQ and NE. If we have LT, then it *is* possible to fold, but + // we don't want to do this. When we have an AND and LT, we need a TST/ANDS, + // so folding would be redundant. + if (Pred != CmpInst::Predicate::ICMP_EQ && + Pred != CmpInst::Predicate::ICMP_NE) + return false; + + // Check if the AND has a constant on its RHS which we can use as a mask. + // If it's a power of 2, then it's the same as checking a specific bit. + // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) + auto MaybeBit = + getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI); + if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value)) + return false; + + uint64_t Bit = Log2_64(static_cast(MaybeBit->Value)); + Register TestReg = AndInst->getOperand(1).getReg(); + bool Invert = Pred == CmpInst::Predicate::ICMP_NE; + + // Emit a TB(N)Z. 
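// tryOptAndIntoCompareBranch above reduces "(x & 2^k) compared against 0" to a
// single test of bit k, with Log2_64 recovering k from the power-of-two mask
// and the ne/eq predicate choosing TBNZ vs. TBZ. A standalone model of that
// reduction (own helpers, not LLVM's):
#include <cassert>
#include <cstdint>

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }
static unsigned log2u64(uint64_t V) {
  unsigned K = 0;
  while (V >>= 1)
    ++K;
  return K;
}

int main() {
  const uint64_t X = 0xCAFEBABE;
  for (unsigned K = 0; K < 64; ++K) {
    const uint64_t Mask = 1ull << K;   // the G_AND constant
    assert(isPowerOf2(Mask) && log2u64(Mask) == K);
    bool ICmpNe = (X & Mask) != 0;     // %cmp = G_ICMP intpred(ne), %and, 0
    bool BitSet = (X >> K) & 1;        // what TBNZ %x, #K branches on
    assert(ICmpNe == BitSet);          // the eq form is just the inverse (TBZ)
  }
  return 0;
}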
+ emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); + return true; +} + +bool AArch64InstructionSelector::selectCompareBranch( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + + const Register CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) + CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg()); + if (CCMI->getOpcode() != TargetOpcode::G_ICMP) + return false; + + Register LHS = CCMI->getOperand(2).getReg(); + Register RHS = CCMI->getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + MachineIRBuilder MIB(I); + CmpInst::Predicate Pred = + (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); + MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI); + + // When we can emit a TB(N)Z, prefer that. + // + // Handle non-commutative condition codes first. + // Note that we don't want to do this when we have a G_AND because it can + // become a tst. The tst will make the test bit in the TB(N)Z redundant. + if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) { + int64_t C = VRegAndVal->Value; + + // When we have a greater-than comparison, we can just test if the msb is + // zero. + if (C == -1 && Pred == CmpInst::ICMP_SGT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + + // When we have a less than comparison, we can just test if the msb is not + // zero. + if (C == 0 && Pred == CmpInst::ICMP_SLT) { + uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; + emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + } + + if (!VRegAndVal) { + std::swap(RHS, LHS); + VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + LHSMI = getDefIgnoringCopies(LHS, MRI); + } + + if (!VRegAndVal || VRegAndVal->Value != 0) { + // If we can't select a CBZ then emit a cmp + Bcc. + MachineInstr *Cmp; + std::tie(Cmp, Pred) = emitIntegerCompare( + CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB); + if (!Cmp) + return false; + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred); + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); + I.eraseFromParent(); + return true; + } + + // Try to emit a TB(N)Z for an eq or ne condition. + if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB, + MIB)) { + I.eraseFromParent(); + return true; + } + + const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); + if (RB.getID() != AArch64::GPRRegBankID) + return false; + if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) + return false; + + const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); + unsigned CBOpc = 0; + if (CmpWidth <= 32) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); + else if (CmpWidth == 64) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX); + else + return false; + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) + .addUse(LHS) + .addMBB(DestMBB) + .constrainAllUses(TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + +/// Returns the element immediate value of a vector shift operand if found. +/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 
+static Optional<int64_t> getVectorShiftImm(Register Reg,
+                                           MachineRegisterInfo &MRI) {
+  assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
+  MachineInstr *OpMI = MRI.getVRegDef(Reg);
+  assert(OpMI && "Expected to find a vreg def for vector shift operand");
+  if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
+    return None;
+
+  // Check all operands are identical immediates.
+  int64_t ImmVal = 0;
+  for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) {
+    auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI);
+    if (!VRegAndVal)
+      return None;
+
+    if (Idx == 1)
+      ImmVal = VRegAndVal->Value;
+    if (ImmVal != VRegAndVal->Value)
+      return None;
+  }
+
+  return ImmVal;
+}
+
+/// Matches and returns the shift immediate value for a SHL instruction given
+/// a shift operand.
+static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
+  Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
+  if (!ShiftImm)
+    return None;
+  // Check the immediate is in range for a SHL.
+  int64_t Imm = *ShiftImm;
+  if (Imm < 0)
+    return None;
+  switch (SrcTy.getElementType().getSizeInBits()) {
+  default:
+    LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
+    return None;
+  case 8:
+    if (Imm > 7)
+      return None;
+    break;
+  case 16:
+    if (Imm > 15)
+      return None;
+    break;
+  case 32:
+    if (Imm > 31)
+      return None;
+    break;
+  case 64:
+    if (Imm > 63)
+      return None;
+    break;
+  }
+  return Imm;
+}
+
+bool AArch64InstructionSelector::selectVectorSHL(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_SHL);
+  Register DstReg = I.getOperand(0).getReg();
+  const LLT Ty = MRI.getType(DstReg);
+  Register Src1Reg = I.getOperand(1).getReg();
+  Register Src2Reg = I.getOperand(2).getReg();
+
+  if (!Ty.isVector())
+    return false;
+
+  // Check if we have a vector of constants on RHS that we can select as the
+  // immediate form.
+  Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
+
+  unsigned Opc = 0;
+  if (Ty == LLT::vector(2, 64)) {
+    Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
+  } else if (Ty == LLT::vector(4, 32)) {
+    Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
+  } else if (Ty == LLT::vector(2, 32)) {
+    Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
+  } else {
+    LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
+    return false;
+  }
+
+  MachineIRBuilder MIB(I);
+  auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
+  if (ImmVal)
+    Shl.addImm(*ImmVal);
+  else
+    Shl.addUse(Src2Reg);
+  constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
+  I.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::selectVectorASHR(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_ASHR);
+  Register DstReg = I.getOperand(0).getReg();
+  const LLT Ty = MRI.getType(DstReg);
+  Register Src1Reg = I.getOperand(1).getReg();
+  Register Src2Reg = I.getOperand(2).getReg();
+
+  if (!Ty.isVector())
+    return false;
+
+  // There is not a shift right register instruction, but the shift left
+  // register instruction takes a signed value, where negative numbers specify a
+  // right shift.
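// As the comment above says, there is no right-shift-by-register vector
// instruction, so selectVectorASHR below negates the shift amounts and uses
// SSHL, whose signed amount shifts left when positive and right when negative.
// A scalar, single-lane model of that trick (plain C++, no NEON involved):
#include <cassert>
#include <cstdint>

static int32_t sshlLane(int32_t Val, int8_t Amt) {
  return Amt >= 0 ? (int32_t)((uint32_t)Val << Amt) : (int32_t)(Val >> -Amt);
}

int main() {
  const int32_t Vals[] = {64, -64, 0x12345678, -1};
  for (int32_t V : Vals)
    for (int8_t S = 0; S < 31; ++S)
      // NEG of the amount followed by SSHL gives the arithmetic right shift.
      assert(sshlLane(V, (int8_t)-S) == (V >> S));
  return 0;
}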
+ + unsigned Opc = 0; + unsigned NegOpc = 0; + const TargetRegisterClass *RC = nullptr; + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::SSHLv2i64; + NegOpc = AArch64::NEGv2i64; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(4, 32)) { + Opc = AArch64::SSHLv4i32; + NegOpc = AArch64::NEGv4i32; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(2, 32)) { + Opc = AArch64::SSHLv2i32; + NegOpc = AArch64::NEGv2i32; + RC = &AArch64::FPR64RegClass; + } else { + LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); + return false; + } + + MachineIRBuilder MIB(I); + auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); + constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); + auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); + constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVaStartAAPCS( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + return false; +} + +bool AArch64InstructionSelector::selectVaStartDarwin( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + Register ListReg = I.getOperand(0).getReg(); + + Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + + auto MIB = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(ArgsAddrReg) + .addFrameIndex(FuncInfo->getVarArgsStackIndex()) + .addImm(0) + .addImm(0); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) + .addUse(ArgsAddrReg) + .addUse(ListReg) + .addImm(0) + .addMemOperand(*I.memoperands_begin()); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +void AArch64InstructionSelector::materializeLargeCMVal( + MachineInstr &I, const Value *V, unsigned OpFlags) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineIRBuilder MIB(I); + + auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); + MovZ->addOperand(MF, I.getOperand(1)); + MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | + AArch64II::MO_NC); + MovZ->addOperand(MF, MachineOperand::CreateImm(0)); + constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); + + auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, + Register ForceDstReg) { + Register DstReg = ForceDstReg + ? 
ForceDstReg + : MRI.createVirtualRegister(&AArch64::GPR64RegClass); + auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); + if (auto *GV = dyn_cast(V)) { + MovI->addOperand(MF, MachineOperand::CreateGA( + GV, MovZ->getOperand(1).getOffset(), Flags)); + } else { + MovI->addOperand( + MF, MachineOperand::CreateBA(cast(V), + MovZ->getOperand(1).getOffset(), Flags)); + } + MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); + constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); + return DstReg; + }; + Register DstReg = BuildMovK(MovZ.getReg(0), + AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); + DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); + BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); + return; +} + +bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + // These shifts are legalized to have 64 bit shift amounts because we want + // to take advantage of the existing imported selection patterns that assume + // the immediates are s64s. However, if the shifted type is 32 bits and for + // some reason we receive input GMIR that has an s64 shift amount that's not + // a G_CONSTANT, insert a truncate so that we can still select the s32 + // register-register variant. + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); + const LLT ShiftTy = MRI.getType(ShiftReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.isVector()) + return false; + assert(!ShiftTy.isVector() && "unexpected vector shift ty"); + if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) + return false; + auto *AmtMI = MRI.getVRegDef(ShiftReg); + assert(AmtMI && "could not find a vreg definition for shift amount"); + if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { + // Insert a subregister copy to implement a 64->32 trunc + MachineIRBuilder MIB(I); + auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) + .addReg(ShiftReg, 0, AArch64::sub_32); + MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(2).setReg(Trunc.getReg(0)); + } + return true; + } + case TargetOpcode::G_STORE: + return contractCrossBankCopyIntoStore(I, MRI); + case TargetOpcode::G_PTR_ADD: + return convertPtrAddToAdd(I, MRI); + case TargetOpcode::G_LOAD: { + // For scalar loads of pointers, we try to convert the dest type from p0 + // to s64 so that our imported patterns can match. Like with the G_PTR_ADD + // conversion, this should be ok because all users should have been + // selected already, so the type doesn't matter for them. + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + if (!DstTy.isPointer()) + return false; + MRI.setType(DstReg, LLT::scalar(64)); + return true; + } + default: + return false; + } +} + +/// This lowering tries to look for G_PTR_ADD instructions and then converts +/// them to a standard G_ADD with a COPY on the source. +/// +/// The motivation behind this is to expose the add semantics to the imported +/// tablegen patterns. We shouldn't need to check for uses being loads/stores, +/// because the selector works bottom up, uses before defs. 
By the time we +/// end up trying to select a G_PTR_ADD, we should have already attempted to +/// fold this into addressing modes and were therefore unsuccessful. +bool AArch64InstructionSelector::convertPtrAddToAdd( + MachineInstr &I, MachineRegisterInfo &MRI) { + assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); + Register DstReg = I.getOperand(0).getReg(); + Register AddOp1Reg = I.getOperand(1).getReg(); + const LLT PtrTy = MRI.getType(DstReg); + if (PtrTy.getAddressSpace() != 0) + return false; + + MachineIRBuilder MIB(I); + const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64); + auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); + // Set regbanks on the registers. + if (PtrTy.isVector()) + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); + else + MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + + // Now turn the %dst(p0) = G_PTR_ADD %base, off into: + // %dst(intty) = G_ADD %intbase, off + I.setDesc(TII.get(TargetOpcode::G_ADD)); + MRI.setType(DstReg, CastPtrTy); + I.getOperand(1).setReg(PtrToInt.getReg(0)); + if (!select(*PtrToInt)) { + LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); + return false; + } + return true; +} + +bool AArch64InstructionSelector::earlySelectSHL( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // We try to match the immediate variant of LSL, which is actually an alias + // for a special case of UBFM. Otherwise, we fall back to the imported + // selector which will match the register variant. + assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); + const auto &MO = I.getOperand(2); + auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI); + if (!VRegAndVal) + return false; + + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + if (DstTy.isVector()) + return false; + bool Is64Bit = DstTy.getSizeInBits() == 64; + auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); + auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); + MachineIRBuilder MIB(I); + + if (!Imm1Fn || !Imm2Fn) + return false; + + auto NewI = + MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, + {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); + + for (auto &RenderFn : *Imm1Fn) + RenderFn(NewI); + for (auto &RenderFn : *Imm2Fn) + RenderFn(NewI); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( + MachineInstr &I, MachineRegisterInfo &MRI) { + assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); + // If we're storing a scalar, it doesn't matter what register bank that + // scalar is on. All that matters is the size. + // + // So, if we see something like this (with a 32-bit scalar as an example): + // + // %x:gpr(s32) = ... something ... + // %y:fpr(s32) = COPY %x:gpr(s32) + // G_STORE %y:fpr(s32) + // + // We can fix this up into something like this: + // + // G_STORE %x:gpr(s32) + // + // And then continue the selection process normally. + Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); + if (!DefDstReg.isValid()) + return false; + LLT DefDstTy = MRI.getType(DefDstReg); + Register StoreSrcReg = I.getOperand(0).getReg(); + LLT StoreSrcTy = MRI.getType(StoreSrcReg); + + // If we get something strange like a physical register, then we shouldn't + // go any further. 
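// earlySelectSHL above matches the immediate form of LSL, which A64 defines as
// an alias of UBFM: "lsl xd, xn, #s" is "ubfm xd, xn, #((64 - s) % 64), #(63 - s)".
// The two rendered immediates are assumed here to come from selectShiftA_64 /
// selectShiftB_64; the code below is a standalone check of that alias
// arithmetic for shift amounts 1..63, not selector code.
#include <cassert>
#include <cstdint>

// UBFM with imms < immr places the low (imms + 1) bits of the source at bit
// position (64 - immr) and zeroes everything else.
static uint64_t ubfmInsertForm(uint64_t Xn, unsigned Immr, unsigned Imms) {
  const unsigned Width = Imms + 1; // always < 64 in this range of shifts
  const unsigned Pos = 64 - Immr;
  return (Xn & ((1ull << Width) - 1)) << Pos;
}

int main() {
  const uint64_t X = 0x0123456789ABCDEFull;
  for (unsigned S = 1; S < 64; ++S) {
    const unsigned Immr = (64 - S) % 64;
    const unsigned Imms = 63 - S;
    assert(Imms < Immr && ubfmInsertForm(X, Immr, Imms) == (X << S));
  }
  return 0;
}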
+ if (!DefDstTy.isValid()) + return false; + + // Are the source and dst types the same size? + if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) + return false; + + if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == + RBI.getRegBank(DefDstReg, MRI, TRI)) + return false; + + // We have a cross-bank copy, which is entering a store. Let's fold it. + I.getOperand(0).setReg(DefDstReg); + return true; +} + +bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + return earlySelectSHL(I, MRI); + case TargetOpcode::G_CONSTANT: { + bool IsZero = false; + if (I.getOperand(1).isCImm()) + IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; + else if (I.getOperand(1).isImm()) + IsZero = I.getOperand(1).getImm() == 0; + + if (!IsZero) + return false; + + Register DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + if (Ty.getSizeInBits() == 64) { + I.getOperand(1).ChangeToRegister(AArch64::XZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); + } else if (Ty.getSizeInBits() == 32) { + I.getOperand(1).ChangeToRegister(AArch64::WZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); + } else + return false; + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + default: + return false; + } +} + +bool AArch64InstructionSelector::select(MachineInstr &I) { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const AArch64Subtarget *Subtarget = + &static_cast(MF.getSubtarget()); + if (Subtarget->requiresStrictAlign()) { + // We don't support this feature yet. + LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); + return false; + } + + unsigned Opcode = I.getOpcode(); + // G_PHI requires same handling as PHI + if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { + // Certain non-generic instructions also need some special handling. 
+ + if (Opcode == TargetOpcode::LOAD_STACK_GUARD) + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + + if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + + const RegClassOrRegBank &RegClassOrBank = + MRI.getRegClassOrRegBank(DefReg); + + const TargetRegisterClass *DefRC + = RegClassOrBank.dyn_cast(); + if (!DefRC) { + if (!DefTy.isValid()) { + LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); + return false; + } + const RegisterBank &RB = *RegClassOrBank.get(); + DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); + if (!DefRC) { + LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); + return false; + } + } + + I.setDesc(TII.get(TargetOpcode::PHI)); + + return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); + } + + if (I.isCopy()) + return selectCopy(I, TII, MRI, TRI, RBI); + + return true; + } + + + if (I.getNumOperands() != I.getNumExplicitOperands()) { + LLVM_DEBUG( + dbgs() << "Generic instruction has unexpected implicit operands\n"); + return false; + } + + // Try to do some lowering before we start instruction selecting. These + // lowerings are purely transformations on the input G_MIR and so selection + // must continue after any modification of the instruction. + if (preISelLower(I)) { + Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. + } + + // There may be patterns where the importer can't deal with them optimally, + // but does select it to a suboptimal sequence so our custom C++ selection + // code later never has a chance to work on it. Therefore, we have an early + // selection attempt here to give priority to certain selection routines + // over the imported ones. + if (earlySelect(I)) + return true; + + if (selectImpl(I, *CoverageInfo)) + return true; + + LLT Ty = + I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{}; + + MachineIRBuilder MIB(I); + + switch (Opcode) { + case TargetOpcode::G_BRCOND: { + if (Ty.getSizeInBits() > 32) { + // We shouldn't need this on AArch64, but it would be implemented as an + // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the + // bit being tested is < 32. + LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty + << ", expected at most 32-bits"); + return false; + } + + const Register CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z + // instructions will not be produced, as they are conditional branch + // instructions that do not set flags. 
+ if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI)) + return true; + + if (ProduceNonFlagSettingCondBr) { + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) + .addUse(CondReg) + .addImm(/*bit offset=*/0) + .addMBB(DestMBB); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + } else { + auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) + .addDef(AArch64::WZR) + .addUse(CondReg) + .addImm(1); + constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI); + auto Bcc = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc)) + .addImm(AArch64CC::EQ) + .addMBB(DestMBB); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI); + } + } + + case TargetOpcode::G_BRINDIRECT: { + I.setDesc(TII.get(AArch64::BR)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_BRJT: + return selectBrJT(I, MRI); + + case AArch64::G_ADD_LOW: { + // This op may have been separated from it's ADRP companion by the localizer + // or some other code motion pass. Given that many CPUs will try to + // macro fuse these operations anyway, select this into a MOVaddr pseudo + // which will later be expanded into an ADRP+ADD pair after scheduling. + MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); + if (BaseMI->getOpcode() != AArch64::ADRP) { + I.setDesc(TII.get(AArch64::ADDXri)); + I.addOperand(MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + assert(TM.getCodeModel() == CodeModel::Small && + "Expected small code model"); + MachineIRBuilder MIB(I); + auto Op1 = BaseMI->getOperand(1); + auto Op2 = I.getOperand(2); + auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) + .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), + Op1.getTargetFlags()) + .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), + Op2.getTargetFlags()); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); + } + + case TargetOpcode::G_BSWAP: { + // Handle vector types for G_BSWAP directly. + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + + // We should only get vector types here; everything else is handled by the + // importer right now. + if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); + return false; + } + + // Only handle 4 and 2 element vectors for now. + // TODO: 16-bit elements. + unsigned NumElts = DstTy.getNumElements(); + if (NumElts != 4 && NumElts != 2) { + LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); + return false; + } + + // Choose the correct opcode for the supported types. Right now, that's + // v2s32, v4s32, and v2s64. + unsigned Opc = 0; + unsigned EltSize = DstTy.getElementType().getSizeInBits(); + if (EltSize == 32) + Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8 + : AArch64::REV32v16i8; + else if (EltSize == 64) + Opc = AArch64::REV64v16i8; + + // We should always get something by the time we get here... 
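// The vector G_BSWAP case above selects REV32 for 32-bit elements, i.e. a byte
// reversal inside every 32-bit lane of the byte vector. A standalone model of
// that lane transform, checked against an ordinary per-element byte swap
// (byteswap32 is an illustrative helper, no NEON involved):
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t byteswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0xFF00u) | ((V << 8) & 0xFF0000u) | (V << 24);
}

int main() {
  const uint32_t Lanes[4] = {0x01020304u, 0xAABBCCDDu, 0x00000001u, 0xFFFFFFFEu};
  uint8_t Bytes[16], Rev[16];
  std::memcpy(Bytes, Lanes, 16);
  // What REV32 on a .16b vector does: reverse the bytes within each 32-bit chunk.
  for (int Chunk = 0; Chunk < 4; ++Chunk)
    for (int B = 0; B < 4; ++B)
      Rev[Chunk * 4 + B] = Bytes[Chunk * 4 + (3 - B)];
  uint32_t Out[4];
  std::memcpy(Out, Rev, 16);
  for (int I = 0; I < 4; ++I)
    assert(Out[I] == byteswap32(Lanes[I]));
  return 0;
}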
+ assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); + + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_CONSTANT: { + const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; + + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + const LLT p0 = LLT::pointer(0, 64); + + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + const unsigned DefSize = DefTy.getSizeInBits(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + // FIXME: Redundant check, but even less readable when factored out. + if (isFP) { + if (Ty != s32 && Ty != s64) { + LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty + << " constant, expected: " << s32 << " or " << s64 + << '\n'); + return false; + } + + if (RB.getID() != AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty + << " constant on bank: " << RB + << ", expected: FPR\n"); + return false; + } + + // The case when we have 0.0 is covered by tablegen. Reject it here so we + // can be sure tablegen works correctly and isn't rescued by this code. + if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0)) + return false; + } else { + // s32 and s64 are covered by tablegen. + if (Ty != p0 && Ty != s8 && Ty != s16) { + LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty + << " constant, expected: " << s32 << ", " << s64 + << ", or " << p0 << '\n'); + return false; + } + + if (RB.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty + << " constant on bank: " << RB + << ", expected: GPR\n"); + return false; + } + } + + // We allow G_CONSTANT of types < 32b. + const unsigned MovOpc = + DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; + + if (isFP) { + // Either emit a FMOV, or emit a copy to emit a normal mov. + const TargetRegisterClass &GPRRC = + DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; + const TargetRegisterClass &FPRRC = + DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass; + + // Can we use a FMOV instruction to represent the immediate? + if (emitFMovForFConstant(I, MRI)) + return true; + + // For 64b values, emit a constant pool load instead. + if (DefSize == 64) { + auto *FPImm = I.getOperand(1).getFPImm(); + MachineIRBuilder MIB(I); + auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); + if (!LoadMI) { + LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); + return false; + } + MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); + } + + // Nope. Emit a copy and use a normal mov instead. + const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC); + MachineOperand &RegOp = I.getOperand(0); + RegOp.setReg(DefGPRReg); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildCopy({DefReg}, {DefGPRReg}); + + if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); + return false; + } + + MachineOperand &ImmOp = I.getOperand(1); + // FIXME: Is going through int64_t always correct? 
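// When no FMOV immediate or constant-pool load applies, the G_FCONSTANT path
// here swaps the FP immediate for its raw bit pattern (bitcastToAPInt), builds
// that integer with MOVi32imm/MOVi64imm in a GPR, and copies it to the FPR.
// A standalone illustration of the bit-pattern step; the later MOVZ/MOVK
// expansion of the pseudo is only mentioned, not modelled:
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t bitsOfDouble(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

int main() {
  // 1.0 prints 0x3ff0000000000000: that is the value the GPR ends up holding
  // before the final copy over to the floating-point register bank.
  std::printf("1.0  -> %#018llx\n", (unsigned long long)bitsOfDouble(1.0));
  std::printf("-2.5 -> %#018llx\n", (unsigned long long)bitsOfDouble(-2.5));
  return 0;
}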
+ ImmOp.ChangeToImmediate( + ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); + } else if (I.getOperand(1).isCImm()) { + uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); + I.getOperand(1).ChangeToImmediate(Val); + } else if (I.getOperand(1).isImm()) { + uint64_t Val = I.getOperand(1).getImm(); + I.getOperand(1).ChangeToImmediate(Val); + } + + I.setDesc(TII.get(MovOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; + } + case TargetOpcode::G_EXTRACT: { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + unsigned SrcSize = SrcTy.getSizeInBits(); + + if (SrcTy.getSizeInBits() > 64) { + // This should be an extract of an s128, which is like a vector extract. + if (SrcTy.getSizeInBits() != 128) + return false; + // Only support extracting 64 bits from an s128 at the moment. + if (DstTy.getSizeInBits() != 64) + return false; + + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + // Check we have the right regbank always. + assert(SrcRB.getID() == AArch64::FPRRegBankID && + DstRB.getID() == AArch64::FPRRegBankID && + "Wrong extract regbank!"); + (void)SrcRB; + + // Emit the same code as a vector extract. + // Offset must be a multiple of 64. + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 64 != 0) + return false; + unsigned LaneIdx = Offset / 64; + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } + + I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); + MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + + Ty.getSizeInBits() - 1); + + if (SrcSize < 64) { + assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && + "unexpected G_EXTRACT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(DstReg, 0, AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(0).setReg(DstReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_INSERT: { + LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned DstSize = DstTy.getSizeInBits(); + // Larger inserts are vectors, same-size ones should be something else by + // now (split up or turned into COPYs). + if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) + return false; + + I.setDesc(TII.get(DstSize == 64 ? 
AArch64::BFMXri : AArch64::BFMWri)); + unsigned LSB = I.getOperand(3).getImm(); + unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + I.getOperand(3).setImm((DstSize - LSB) % DstSize); + MachineInstrBuilder(MF, I).addImm(Width - 1); + + if (DstSize < 64) { + assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && + "unexpected G_INSERT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + BuildMI(MBB, I.getIterator(), I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG)) + .addDef(SrcReg) + .addImm(0) + .addUse(I.getOperand(2).getReg()) + .addImm(AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(2).setReg(SrcReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FRAME_INDEX: { + // allocas and G_FRAME_INDEX are only supported in addrspace(0). + if (Ty != LLT::pointer(0, 64)) { + LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty + << ", expected: " << LLT::pointer(0, 64) << '\n'); + return false; + } + I.setDesc(TII.get(AArch64::ADDXri)); + + // MOs for a #0 shifted immediate. + I.addOperand(MachineOperand::CreateImm(0)); + I.addOperand(MachineOperand::CreateImm(0)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_GLOBAL_VALUE: { + auto GV = I.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return selectTLSGlobalValue(I, MRI); + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); + if (OpFlags & AArch64II::MO_GOT) { + I.setDesc(TII.get(AArch64::LOADgot)); + I.getOperand(1).setTargetFlags(OpFlags); + } else if (TM.getCodeModel() == CodeModel::Large) { + // Materialize the global using movz/movk instructions. + materializeLargeCMVal(I, GV, OpFlags); + I.eraseFromParent(); + return true; + } else if (TM.getCodeModel() == CodeModel::Tiny) { + I.setDesc(TII.get(AArch64::ADR)); + I.getOperand(1).setTargetFlags(OpFlags); + } else { + I.setDesc(TII.get(AArch64::MOVaddr)); + I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); + MachineInstrBuilder MIB(MF, I); + MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), + OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_ZEXTLOAD: + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; + MachineIRBuilder MIB(I); + + LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); + + if (PtrTy != LLT::pointer(0, 64)) { + LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy + << ", expected: " << LLT::pointer(0, 64) << '\n'); + return false; + } + + auto &MemOp = **I.memoperands_begin(); + if (MemOp.isAtomic()) { + // For now we just support s8 acquire loads to be able to compile stack + // protector code. + if (MemOp.getOrdering() == AtomicOrdering::Acquire && + MemOp.getSize() == 1) { + I.setDesc(TII.get(AArch64::LDARB)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); + return false; + } + unsigned MemSizeInBits = MemOp.getSize() * 8; + + const Register PtrReg = I.getOperand(1).getReg(); +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); + // Sanity-check the pointer register. 
+ assert(PtrRB.getID() == AArch64::GPRRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(MRI.getType(PtrReg).isPointer() && + "Load/Store pointer operand isn't a pointer"); +#endif + + const Register ValReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); + + const unsigned NewOpc = + selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); + if (NewOpc == I.getOpcode()) + return false; + + I.setDesc(TII.get(NewOpc)); + + uint64_t Offset = 0; + auto *PtrMI = MRI.getVRegDef(PtrReg); + + // Try to fold a GEP into our unsigned immediate addressing mode. + if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { + if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { + int64_t Imm = *COff; + const unsigned Size = MemSizeInBits / 8; + const unsigned Scale = Log2_32(Size); + if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { + Register Ptr2Reg = PtrMI->getOperand(1).getReg(); + I.getOperand(1).setReg(Ptr2Reg); + PtrMI = MRI.getVRegDef(Ptr2Reg); + Offset = Imm / Size; + } + } + } + + // If we haven't folded anything into our addressing mode yet, try to fold + // a frame index into the base+offset. + if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) + I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); + + I.addOperand(MachineOperand::CreateImm(Offset)); + + // If we're storing a 0, use WZR/XZR. + if (auto CVal = getConstantVRegVal(ValReg, MRI)) { + if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { + if (I.getOpcode() == AArch64::STRWui) + I.getOperand(0).setReg(AArch64::WZR); + else if (I.getOpcode() == AArch64::STRXui) + I.getOperand(0).setReg(AArch64::XZR); + } + } + + if (IsZExtLoad) { + // The zextload from a smaller type to i32 should be handled by the importer. + if (MRI.getType(ValReg).getSizeInBits() != 64) + return false; + // If we have a ZEXTLOAD then change the load's type to be a narrower reg + //and zero_extend with SUBREG_TO_REG. + Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + Register DstReg = I.getOperand(0).getReg(); + I.getOperand(0).setReg(LdReg); + + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) + .addImm(0) + .addUse(LdReg) + .addImm(AArch64::sub_32); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, + MRI); + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_SMULH: + case TargetOpcode::G_UMULH: { + // Reject the various things we don't support yet. + if (unsupportedBinOp(I, RBI, MRI, TRI)) + return false; + + const Register DefReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + if (RB.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); + return false; + } + + if (Ty != LLT::scalar(64)) { + LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty + << ", expected: " << LLT::scalar(64) << '\n'); + return false; + } + + unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr + : AArch64::UMULHrr; + I.setDesc(TII.get(NewOpc)); + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. 
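// The G_PTR_ADD fold above only fires when the constant offset fits the
// unsigned scaled-immediate form of LDR/STR: non-negative, a multiple of the
// access size, and at most 4095 steps of that size. A standalone version of
// exactly that predicate (fitsScaledUImm12 is an illustrative name):
#include <cassert>
#include <cstdint>

// Mirrors: (Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale).
static bool fitsScaledUImm12(int64_t Imm, unsigned SizeInBytes, unsigned Scale) {
  return (Imm & (SizeInBytes - 1)) == 0 && Imm >= 0 &&
         Imm < (int64_t(0x1000) << Scale);
}

int main() {
  // 8-byte access (e.g. LDRXui): offsets 0, 8, ..., 32760 are encodable.
  assert(fitsScaledUImm12(32760, 8, 3));
  assert(!fitsScaledUImm12(32768, 8, 3)); // one step past the 12-bit range
  assert(!fitsScaledUImm12(12, 8, 3));    // not a multiple of the access size
  assert(!fitsScaledUImm12(-8, 8, 3));    // negative offsets stay unfolded here
  return 0;
}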
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + + case TargetOpcode::G_ASHR: + if (MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorASHR(I, MRI); + LLVM_FALLTHROUGH; + case TargetOpcode::G_SHL: + if (Opcode == TargetOpcode::G_SHL && + MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorSHL(I, MRI); + LLVM_FALLTHROUGH; + case TargetOpcode::G_OR: + case TargetOpcode::G_LSHR: { + // Reject the various things we don't support yet. + if (unsupportedBinOp(I, RBI, MRI, TRI)) + return false; + + const unsigned OpSize = Ty.getSizeInBits(); + + const Register DefReg = I.getOperand(0).getReg(); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); + if (NewOpc == I.getOpcode()) + return false; + + I.setDesc(TII.get(NewOpc)); + // FIXME: Should the type be always reset in setDesc? + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + case TargetOpcode::G_PTR_ADD: { + MachineIRBuilder MIRBuilder(I); + emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), + MIRBuilder); + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_UADDO: { + // TODO: Support other types. + unsigned OpSize = Ty.getSizeInBits(); + if (OpSize != 32 && OpSize != 64) { + LLVM_DEBUG( + dbgs() + << "G_UADDO currently only supported for 32 and 64 b types.\n"); + return false; + } + + // TODO: Support vectors. + if (Ty.isVector()) { + LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n"); + return false; + } + + // Add and set the set condition flag. + unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr; + MachineIRBuilder MIRBuilder(I); + auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)}, + {I.getOperand(2), I.getOperand(3)}); + constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); + + // Now, put the overflow result in the register given by the first operand + // to the G_UADDO. CSINC increments the result when the predicate is false, + // so to get the increment when it's true, we need to use the inverse. In + // this case, we want to increment when carry is set. 
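// That comment is the whole trick: CSINC Wd, Wn, Wm, cond writes Wn when cond
// holds and Wm + 1 otherwise, so producing "1 if carry (HS), else 0" out of
// WZR needs the inverted condition, which is exactly what the CSINCWr built
// below does. A standalone model of ADDS plus that idiom (modelled flags only):
#include <cassert>
#include <cstdint>

static uint32_t csinc(bool Cond, uint32_t Wn, uint32_t Wm) {
  return Cond ? Wn : Wm + 1;
}

int main() {
  const uint32_t Cases[][2] = {{0xFFFFFFFFu, 1u}, {1u, 2u}, {0x80000000u, 0x80000000u}};
  for (auto &C : Cases) {
    uint32_t Sum = C[0] + C[1];   // what ADDSWrr computes
    bool Carry = Sum < C[0];      // the carry (HS) it sets on unsigned overflow
    // CSINC from WZR/WZR with the *inverted* condition yields the carry bit.
    assert(csinc(!Carry, 0, 0) == (Carry ? 1u : 0u));
  }
  return 0;
}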
+ auto CsetMI = MIRBuilder + .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()}, + {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(getInvertedCondCode(AArch64CC::HS)); + constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + case TargetOpcode::G_PTRMASK: { + Register MaskReg = I.getOperand(2).getReg(); + Optional MaskVal = getConstantVRegVal(MaskReg, MRI); + // TODO: Implement arbitrary cases + if (!MaskVal || !isShiftedMask_64(*MaskVal)) + return false; + + uint64_t Mask = *MaskVal; + I.setDesc(TII.get(AArch64::ANDXri)); + I.getOperand(2).ChangeToImmediate( + AArch64_AM::encodeLogicalImmediate(Mask, 64)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_PTRTOINT: + case TargetOpcode::G_TRUNC: { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + + if (DstRB.getID() != SrcRB.getID()) { + LLVM_DEBUG( + dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); + return false; + } + + if (DstRB.getID() == AArch64::GPRRegBankID) { + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(DstTy, DstRB, RBI); + if (!DstRC) + return false; + + const TargetRegisterClass *SrcRC = + getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); + if (!SrcRC) + return false; + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); + return false; + } + + if (DstRC == SrcRC) { + // Nothing to be done + } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && + SrcTy == LLT::scalar(64)) { + llvm_unreachable("TableGen can import this case"); + return false; + } else if (DstRC == &AArch64::GPR32RegClass && + SrcRC == &AArch64::GPR64RegClass) { + I.getOperand(1).setSubReg(AArch64::sub_32); + } else { + LLVM_DEBUG( + dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); + return false; + } + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } else if (DstRB.getID() == AArch64::FPRRegBankID) { + if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) { + I.setDesc(TII.get(AArch64::XTNv4i16)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; + } + + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } + + // We might have a vector G_PTRTOINT, in which case just emit a COPY. 
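// The G_PTRMASK case above only handles masks that isShiftedMask_64 accepts:
// a single contiguous run of ones, e.g. the ~15 of an align-to-16 pointer
// mask, which then encodes as an ANDXri logical immediate. A standalone
// equivalent of that check (own helper, not LLVM's):
#include <cassert>
#include <cstdint>

// True for values of the form 0...0 1...1 0...0 (one run of ones).
static bool isShiftedMask(uint64_t V) {
  if (V == 0)
    return false;
  const uint64_t M = V | (V - 1); // fill the zeros below the run
  return ((M + 1) & M) == 0;      // the result must be 2^k - 1
}

int main() {
  assert(isShiftedMask(~uint64_t(15)));     // align-to-16 mask
  assert(isShiftedMask(0x0000FFFF0000ull)); // run in the middle
  assert(!isShiftedMask(0));
  assert(!isShiftedMask(0x5));              // two separate one bits
  return 0;
}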
+ if (Opcode == TargetOpcode::G_PTRTOINT) { + assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + } + + return false; + } + + case TargetOpcode::G_ANYEXT: { + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); + if (RBDst.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst + << ", expected: GPR\n"); + return false; + } + + const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); + if (RBSrc.getID() != AArch64::GPRRegBankID) { + LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc + << ", expected: GPR\n"); + return false; + } + + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + + if (DstSize == 0) { + LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); + return false; + } + + if (DstSize != 64 && DstSize > 32) { + LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize + << ", expected: 32 or 64\n"); + return false; + } + // At this point G_ANYEXT is just like a plain COPY, but we need + // to explicitly form the 64-bit value if any. + if (DstSize > 32) { + Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) + .addDef(ExtSrc) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + I.getOperand(1).setReg(ExtSrc); + } + return selectCopy(I, TII, MRI, TRI, RBI); + } + + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT_INREG: + case TargetOpcode::G_SEXT: { + unsigned Opcode = I.getOpcode(); + const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; + const Register DefReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI.getType(DefReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); + + // SEXT_INREG has the same src reg size as dst, the size of the value to be + // extended is encoded in the imm. + if (Opcode == TargetOpcode::G_SEXT_INREG) + SrcSize = I.getOperand(2).getImm(); + + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. + + assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == + AArch64::GPRRegBankID && + "Unexpected ext regbank"); + + MachineIRBuilder MIB(I); + MachineInstr *ExtI; + + // First check if we're extending the result of a load which has a dest type + // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest + // GPR register on AArch64 and all loads which are smaller automatically + // zero-extend the upper bits. E.g. + // %v(s8) = G_LOAD %p, :: (load 1) + // %v2(s32) = G_ZEXT %v(s8) + if (!IsSigned) { + auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + bool IsGPR = + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; + if (LoadMI && IsGPR) { + const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); + unsigned BytesLoaded = MemOp->getSize(); + if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) + return selectCopy(I, TII, MRI, TRI, RBI); + } + + // If we are zero extending from 32 bits to 64 bits, it's possible that + // the instruction implicitly does the zero extend for us. In that case, + // we can just emit a SUBREG_TO_REG. + if (IsGPR && SrcSize == 32 && DstSize == 64) { + // Unlike with the G_LOAD case, we don't want to look through copies + // here. 
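// Both extension paths below come down to bitfield moves: UBFM/SBFM with
// immr = 0 and imms = SrcSize - 1 zero- or sign-extend the low SrcSize bits,
// and a 32-to-64-bit zero extend is free because any W-register write already
// clears the upper half (the isDef32 + SUBREG_TO_REG case). A standalone
// check of that low-bits semantics (ubfx0/sbfx0 are illustrative helpers):
#include <cassert>
#include <cstdint>

static uint64_t ubfx0(uint64_t X, unsigned N) {
  return N == 64 ? X : (X & ((1ull << N) - 1));
}
static int64_t sbfx0(uint64_t X, unsigned N) {
  return (int64_t)(X << (64 - N)) >> (64 - N); // move the sign bit into place
}

int main() {
  const uint64_t X = 0xFFFFFFFFFFFFFF80ull;         // low byte is -128
  assert(ubfx0(X, 8) == 0x80);                      // G_ZEXT s8 -> s64
  assert(sbfx0(X, 8) == -128);                      // G_SEXT s8 -> s64
  assert(sbfx0(X, 16) == (int64_t)(int16_t)0xFF80); // G_SEXT_INREG, width 16
  assert((uint64_t)(uint32_t)X == ubfx0(X, 32));    // W write == zext to 64
  return 0;
}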
+ MachineInstr *Def = MRI.getVRegDef(SrcReg); + if (Def && isDef32(*Def)) { + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + + if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); + return false; + } + + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); + return false; + } + + I.eraseFromParent(); + return true; + } + } + } + + if (DstSize == 64) { + if (Opcode != TargetOpcode::G_SEXT_INREG) { + // FIXME: Can we avoid manually doing this? + if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, + MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) + << " operand\n"); + return false; + } + SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, + {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32) + .getReg(0); + } + + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else if (DstSize <= 32) { + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else { + return false; + } + + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), + SrcTy = MRI.getType(I.getOperand(1).getReg()); + const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); + if (NewOpc == Opcode) + return false; + + I.setDesc(TII.get(NewOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + + return true; + } + + case TargetOpcode::G_FREEZE: + return selectCopy(I, TII, MRI, TRI, RBI); + + case TargetOpcode::G_INTTOPTR: + // The importer is currently unable to import pointer types since they + // didn't exist in SelectionDAG. + return selectCopy(I, TII, MRI, TRI, RBI); + + case TargetOpcode::G_BITCAST: + // Imported SelectionDAG rules can handle every bitcast except those that + // bitcast from a type to the same type. Ideally, these shouldn't occur + // but we might not run an optimizer that deletes them. The other exception + // is bitcasts involving pointer types, as SelectionDAG has no knowledge + // of them. 
+ return selectCopy(I, TII, MRI, TRI, RBI); + + case TargetOpcode::G_SELECT: { + if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { + LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty + << ", expected: " << LLT::scalar(1) << '\n'); + return false; + } + + const Register CondReg = I.getOperand(1).getReg(); + const Register TReg = I.getOperand(2).getReg(); + const Register FReg = I.getOperand(3).getReg(); + + if (tryOptSelect(I)) + return true; + + Register CSelOpc = selectSelectOpc(I, MRI, RBI); + MachineInstr &TstMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) + .addDef(AArch64::WZR) + .addUse(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc)) + .addDef(I.getOperand(0).getReg()) + .addUse(TReg) + .addUse(FReg) + .addImm(AArch64CC::NE); + + constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_ICMP: { + if (Ty.isVector()) + return selectVectorICmp(I, MRI); + + if (Ty != LLT::scalar(32)) { + LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty + << ", expected: " << LLT::scalar(32) << '\n'); + return false; + } + + MachineIRBuilder MIRBuilder(I); + MachineInstr *Cmp; + CmpInst::Predicate Pred; + std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3), + I.getOperand(1), MIRBuilder); + if (!Cmp) + return false; + emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder); + I.eraseFromParent(); + return true; + } + + case TargetOpcode::G_FCMP: { + if (Ty != LLT::scalar(32)) { + LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty + << ", expected: " << LLT::scalar(32) << '\n'); + return false; + } + + unsigned CmpOpc = selectFCMPOpc(I, MRI); + if (!CmpOpc) + return false; + + // FIXME: regbank + + AArch64CC::CondCode CC1, CC2; + changeFCMPPredToAArch64CC( + (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2); + + // Partially build the compare. Decide if we need to add a use for the + // third operand based off whether or not we're comparing against 0.0. + auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) + .addUse(I.getOperand(2).getReg()); + + // If we don't have an immediate compare, then we need to add a use of the + // register which wasn't used for the immediate. + // Note that the immediate will always be the last operand. 
+ if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + CmpMI = CmpMI.addUse(I.getOperand(3).getReg()); + + const Register DefReg = I.getOperand(0).getReg(); + Register Def1Reg = DefReg; + if (CC2 != AArch64CC::AL) + Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + + MachineInstr &CSetMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) + .addDef(Def1Reg) + .addUse(AArch64::WZR) + .addUse(AArch64::WZR) + .addImm(getInvertedCondCode(CC1)); + + if (CC2 != AArch64CC::AL) { + Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MachineInstr &CSet2MI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) + .addDef(Def2Reg) + .addUse(AArch64::WZR) + .addUse(AArch64::WZR) + .addImm(getInvertedCondCode(CC2)); + MachineInstr &OrMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) + .addDef(DefReg) + .addUse(Def1Reg) + .addUse(Def2Reg); + constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); + } + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_VASTART: + return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) + : selectVaStartAAPCS(I, MF, MRI); + case TargetOpcode::G_INTRINSIC: + return selectIntrinsic(I, MRI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectIntrinsicWithSideEffects(I, MRI); + case TargetOpcode::G_IMPLICIT_DEF: { + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const Register DstReg = I.getOperand(0).getReg(); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(DstTy, DstRB, RBI); + RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + return true; + } + case TargetOpcode::G_BLOCK_ADDR: { + if (TM.getCodeModel() == CodeModel::Large) { + materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); + I.eraseFromParent(); + return true; + } else { + I.setDesc(TII.get(AArch64::MOVaddrBA)); + auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), + I.getOperand(0).getReg()) + .addBlockAddress(I.getOperand(1).getBlockAddress(), + /* Offset */ 0, AArch64II::MO_PAGE) + .addBlockAddress( + I.getOperand(1).getBlockAddress(), /* Offset */ 0, + AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); + } + } + case TargetOpcode::G_INTRINSIC_TRUNC: + return selectIntrinsicTrunc(I, MRI); + case TargetOpcode::G_INTRINSIC_ROUND: + return selectIntrinsicRound(I, MRI); + case TargetOpcode::G_BUILD_VECTOR: + return selectBuildVector(I, MRI); + case TargetOpcode::G_MERGE_VALUES: + return selectMergeValues(I, MRI); + case TargetOpcode::G_UNMERGE_VALUES: + return selectUnmergeValues(I, MRI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return selectShuffleVector(I, MRI); + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return selectExtractElt(I, MRI); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return selectInsertElt(I, MRI); + case TargetOpcode::G_CONCAT_VECTORS: + return selectConcatVectors(I, MRI); + case TargetOpcode::G_JUMP_TABLE: + return selectJumpTable(I, MRI); + } + + return false; +} + +bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, + MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); + Register JTAddr = 
I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + Register Index = I.getOperand(2).getReg(); + MachineIRBuilder MIB(I); + + Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, + {TargetReg, ScratchReg}, {JTAddr, Index}) + .addJumpTableIndex(JTI); + // Build the indirect branch. + MIB.buildInstr(AArch64::BR, {}, {TargetReg}); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectJumpTable( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); + assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); + + Register DstReg = I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. + MachineIRBuilder MIB(I); + auto MovMI = + MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) + .addJumpTableIndex(JTI, AArch64II::MO_PAGE) + .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectTLSGlobalValue( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (!STI.isTargetMachO()) + return false; + MachineFunction &MF = *I.getParent()->getParent(); + MF.getFrameInfo().setAdjustsStack(true); + + const GlobalValue &GV = *I.getOperand(1).getGlobal(); + MachineIRBuilder MIB(I); + + MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + + auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, + {Register(AArch64::X0)}) + .addImm(0); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) + .addDef(AArch64::X0, RegState::Implicit) + .addRegMask(TRI.getTLSCallPreservedMask()); + + MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, + MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectIntrinsicTrunc( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTZHr; + break; + case 32: + Opc = AArch64::FRINTZSr; + break; + case 64: + Opc = AArch64::FRINTZDr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTZv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTZv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTZv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. 
+ LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectIntrinsicRound( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTAHr; + break; + case 32: + Opc = AArch64::FRINTASr; + break; + case 64: + Opc = AArch64::FRINTADr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTAv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTAv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTAv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. + LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectVectorICmp( + MachineInstr &I, MachineRegisterInfo &MRI) const { + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register SrcReg = I.getOperand(2).getReg(); + Register Src2Reg = I.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); + unsigned NumElts = DstTy.getNumElements(); + + // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b + // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 + // Third index is cc opcode: + // 0 == eq + // 1 == ugt + // 2 == uge + // 3 == ult + // 4 == ule + // 5 == sgt + // 6 == sge + // 7 == slt + // 8 == sle + // ne is done by negating 'eq' result. + + // This table below assumes that for some comparisons the operands will be + // commuted. + // ult op == commute + ugt op + // ule op == commute + uge op + // slt op == commute + sgt op + // sle op == commute + sge op + unsigned PredIdx = 0; + bool SwapOperands = false; + CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_EQ: + PredIdx = 0; + break; + case CmpInst::ICMP_UGT: + PredIdx = 1; + break; + case CmpInst::ICMP_UGE: + PredIdx = 2; + break; + case CmpInst::ICMP_ULT: + PredIdx = 3; + SwapOperands = true; + break; + case CmpInst::ICMP_ULE: + PredIdx = 4; + SwapOperands = true; + break; + case CmpInst::ICMP_SGT: + PredIdx = 5; + break; + case CmpInst::ICMP_SGE: + PredIdx = 6; + break; + case CmpInst::ICMP_SLT: + PredIdx = 7; + SwapOperands = true; + break; + case CmpInst::ICMP_SLE: + PredIdx = 8; + SwapOperands = true; + break; + default: + llvm_unreachable("Unhandled icmp predicate"); + return false; + } + + // This table obviously should be tablegen'd when we have our GISel native + // tablegen selector. 
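+  // Worked example of the indexing below (a sketch, not from this patch):
+  // a <4 x s32> G_ICMP with predicate ult has
+  //   EltIdx     = Log2_32(32 / 8) = 2
+  //   NumEltsIdx = Log2_32(4 / 2)  = 1
+  //   PredIdx    = 3, SwapOperands = true
+  // so OpcTable[2][1][3] == AArch64::CMHIv4i32 is used with the operands
+  // commuted, i.e. "x ult y" becomes "cmhi y, x" (unsigned higher).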
+ + static const unsigned OpcTable[4][4][9] = { + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, + AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, + AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, + {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, + AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, + AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} + }, + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, + AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, + AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, + {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, + AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, + AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, + AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, + AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, + {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, + AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, + AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, + AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, + AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + }; + unsigned EltIdx = Log2_32(SrcEltSize / 8); + unsigned NumEltsIdx = Log2_32(NumElts / 2); + unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; + if (!Opc) { + LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); + return false; + } + + const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = + getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); + if (!SrcRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return false; + } + + unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; + if (SrcTy.getSizeInBits() == 128) + NotOpc = NotOpc ? 
AArch64::NOTv16i8 : 0; + + if (SwapOperands) + std::swap(SrcReg, Src2Reg); + + MachineIRBuilder MIB(I); + auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + + // Invert if we had a 'ne' cc. + if (NotOpc) { + Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } else { + MIB.buildCopy(DstReg, Cmp.getReg(0)); + } + RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::emitScalarToVector( + unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, + MachineIRBuilder &MIRBuilder) const { + auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); + + auto BuildFn = [&](unsigned SubregIndex) { + auto Ins = + MIRBuilder + .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) + .addImm(SubregIndex); + constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); + return &*Ins; + }; + + switch (EltSize) { + case 16: + return BuildFn(AArch64::hsub); + case 32: + return BuildFn(AArch64::ssub); + case 64: + return BuildFn(AArch64::dsub); + default: + return nullptr; + } +} + +bool AArch64InstructionSelector::selectMergeValues( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + + if (I.getNumOperands() != 3) + return false; + + // Merging 2 s64s into an s128. 
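+  // Roughly (a sketch with invented vregs; the exact insert opcode comes from
+  // the source register bank via emitLaneInsert):
+  //   %undef:fpr128 = IMPLICIT_DEF
+  //   %lo128:fpr128 = INSvi64gpr %undef, 0, %src1   // or INSvi64lane for FPR
+  //   %dst:fpr128   = INSvi64gpr %lo128, 1, %src2
+  // i.e. two 64-bit lane inserts into an implicit-def 128-bit register.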
+ if (DstTy == LLT::scalar(128)) { + if (SrcTy.getSizeInBits() != 64) + return false; + MachineIRBuilder MIB(I); + Register DstReg = I.getOperand(0).getReg(); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); + MachineInstr *InsMI = + emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); + if (!InsMI) + return false; + MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), + Src2Reg, /* LaneIdx */ 1, RB, MIB); + if (!Ins2MI) + return false; + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + + if (RB.getID() != AArch64::GPRRegBankID) + return false; + + if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) + return false; + + auto *DstRC = &AArch64::GPR64RegClass; + Register SubToRegDef = MRI.createVirtualRegister(DstRC); + MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(SubToRegDef) + .addImm(0) + .addUse(I.getOperand(1).getReg()) + .addImm(AArch64::sub_32); + Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); + // Need to anyext the second scalar before we can use bfm + MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(SubToRegDef2) + .addImm(0) + .addUse(I.getOperand(2).getReg()) + .addImm(AArch64::sub_32); + MachineInstr &BFM = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) + .addDef(I.getOperand(0).getReg()) + .addUse(SubToRegDef) + .addUse(SubToRegDef2) + .addImm(32) + .addImm(31); + constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); + constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, + const unsigned EltSize) { + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + switch (EltSize) { + case 16: + CopyOpc = AArch64::CPYi16; + ExtractSubReg = AArch64::hsub; + break; + case 32: + CopyOpc = AArch64::CPYi32; + ExtractSubReg = AArch64::ssub; + break; + case 64: + CopyOpc = AArch64::CPYi64; + ExtractSubReg = AArch64::dsub; + break; + default: + // Unknown size, bail out. 
+ LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); + return false; + } + return true; +} + +MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( + Optional DstReg, const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { + LLVM_DEBUG( + dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); + return nullptr; + } + + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); + return nullptr; + } + + const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); + const LLT &VecTy = MRI.getType(VecReg); + const TargetRegisterClass *VecRC = + getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); + if (!VecRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return nullptr; + } + + // The register that we're going to copy into. + Register InsertReg = VecReg; + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + // If the lane index is 0, we just use a subregister COPY. + if (LaneIdx == 0) { + auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) + .addReg(VecReg, 0, ExtractSubReg); + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return &*Copy; + } + + // Lane copies require 128-bit wide registers. If we're dealing with an + // unpacked vector, then we need to move up to that width. Insert an implicit + // def and a subregister insert to get us there. + if (VecTy.getSizeInBits() != 128) { + MachineInstr *ScalarToVector = emitScalarToVector( + VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); + if (!ScalarToVector) + return nullptr; + InsertReg = ScalarToVector->getOperand(0).getReg(); + } + + MachineInstr *LaneCopyMI = + MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); + constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); + + // Make sure that we actually constrain the initial copy. + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return LaneCopyMI; +} + +bool AArch64InstructionSelector::selectExtractElt( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && + "unexpected opcode!"); + Register DstReg = I.getOperand(0).getReg(); + const LLT NarrowTy = MRI.getType(DstReg); + const Register SrcReg = I.getOperand(1).getReg(); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && + "source register size too small!"); + assert(NarrowTy.isScalar() && "cannot extract vector into vector!"); + + // Need the lane index to determine the correct copy opcode. + MachineOperand &LaneIdxOp = I.getOperand(2); + assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); + return false; + } + + // Find the index to extract from. 
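+  // The index has to resolve to a constant, possibly through copies. Sketch of
+  // the supported shape (invented vregs):
+  //   %idx:gpr(s64) = G_CONSTANT i64 1
+  //   %el:fpr(s32)  = G_EXTRACT_VECTOR_ELT %vec(<4 x s32>), %idx
+  // which becomes a lane copy (CPYi32, lane 1). A non-constant index is
+  // rejected just below.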
+ auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + MachineIRBuilder MIRBuilder(I); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, + LaneIdx, MIRBuilder); + if (!Extract) + return false; + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectSplitVectorUnmerge( + MachineInstr &I, MachineRegisterInfo &MRI) const { + unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(SrcReg); + + assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); + if (SrcTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); + return false; + } + + MachineIRBuilder MIB(I); + + // We implement a split vector operation by treating the sub-vectors as + // scalars and extracting them. + const RegisterBank &DstRB = + *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); + for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { + Register Dst = I.getOperand(OpIdx).getReg(); + MachineInstr *Extract = + emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); + if (!Extract) + return false; + } + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectUnmergeValues( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "unexpected opcode"); + + // TODO: Handle unmerging into GPRs and from scalars to scalars. + if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID || + RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " + "currently unsupported.\n"); + return false; + } + + // The last operand is the vector source register, and every other operand is + // a register to unpack into. + unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && + "can only unmerge from vector or s128 types!"); + assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && + "source register size too small!"); + + if (!NarrowTy.isScalar()) + return selectSplitVectorUnmerge(I, MRI); + + MachineIRBuilder MIB(I); + + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) + return false; + + // Set up for the lane copies. + MachineBasicBlock &MBB = *I.getParent(); + + // Stores the registers we'll be copying from. + SmallVector InsertRegs; + + // We'll use the first register twice, so we only need NumElts-1 registers. + unsigned NumInsertRegs = NumElts - 1; + + // If our elements fit into exactly 128 bits, then we can copy from the source + // directly. Otherwise, we need to do a bit of setup with some subregister + // inserts. + if (NarrowTy.getSizeInBits() * NumElts == 128) { + InsertRegs = SmallVector(NumInsertRegs, SrcReg); + } else { + // No. We have to perform subregister inserts. 
For each insert, create an + // implicit def and a subregister insert, and save the register we create. + for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { + Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &ImpDefMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), + ImpDefReg); + + // Now, create the subregister insert from SrcReg. + Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &InsMI = + *BuildMI(MBB, I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) + .addUse(ImpDefReg) + .addUse(SrcReg) + .addImm(AArch64::dsub); + + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + + // Save the register so that we can copy from it after. + InsertRegs.push_back(InsertReg); + } + } + + // Now that we've created any necessary subregister inserts, we can + // create the copies. + // + // Perform the first copy separately as a subregister copy. + Register CopyTo = I.getOperand(0).getReg(); + auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) + .addReg(InsertRegs[0], 0, ExtractSubReg); + constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); + + // Now, perform the remaining copies as vector lane copies. + unsigned LaneIdx = 1; + for (Register InsReg : InsertRegs) { + Register CopyTo = I.getOperand(LaneIdx).getReg(); + MachineInstr &CopyInst = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) + .addUse(InsReg) + .addImm(LaneIdx); + constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); + ++LaneIdx; + } + + // Separately constrain the first copy's destination. Because of the + // limitation in constrainOperandRegClass, we can't guarantee that this will + // actually be constrained. So, do it ourselves using the second operand. 
+  const TargetRegisterClass *RC =
+      MRI.getRegClassOrNull(I.getOperand(1).getReg());
+  if (!RC) {
+    LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
+    return false;
+  }
+
+  RBI.constrainGenericRegister(CopyTo, *RC, MRI);
+  I.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::selectConcatVectors(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
+         "Unexpected opcode");
+  Register Dst = I.getOperand(0).getReg();
+  Register Op1 = I.getOperand(1).getReg();
+  Register Op2 = I.getOperand(2).getReg();
+  MachineIRBuilder MIRBuilder(I);
+  MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder);
+  if (!ConcatMI)
+    return false;
+  I.eraseFromParent();
+  return true;
+}
+
+unsigned
+AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
+                                                  MachineFunction &MF) const {
+  Type *CPTy = CPVal->getType();
+  Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
+
+  MachineConstantPool *MCP = MF.getConstantPool();
+  return MCP->getConstantPoolIndex(CPVal, Alignment);
+}
+
+MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
+    const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
+  unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
+
+  auto Adrp =
+      MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
+          .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
+
+  MachineInstr *LoadMI = nullptr;
+  switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+  case 16:
+    LoadMI =
+        &*MIRBuilder
+              .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
+              .addConstantPoolIndex(CPIdx, 0,
+                                    AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    break;
+  case 8:
+    LoadMI = &*MIRBuilder
+                  .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+                  .addConstantPoolIndex(
+                      CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    break;
+  default:
+    LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
+                      << *CPVal->getType());
+    return nullptr;
+  }
+  constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
+  constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+  return LoadMI;
+}
+
+/// Return an <Opc, SubregIdx> pair to do an vector elt insert of a given
+/// size and RB.
+static std::pair<unsigned, unsigned>
+getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
+  unsigned Opc, SubregIdx;
+  if (RB.getID() == AArch64::GPRRegBankID) {
+    if (EltSize == 32) {
+      Opc = AArch64::INSvi32gpr;
+      SubregIdx = AArch64::ssub;
+    } else if (EltSize == 64) {
+      Opc = AArch64::INSvi64gpr;
+      SubregIdx = AArch64::dsub;
+    } else {
+      llvm_unreachable("invalid elt size!");
+    }
+  } else {
+    if (EltSize == 8) {
+      Opc = AArch64::INSvi8lane;
+      SubregIdx = AArch64::bsub;
+    } else if (EltSize == 16) {
+      Opc = AArch64::INSvi16lane;
+      SubregIdx = AArch64::hsub;
+    } else if (EltSize == 32) {
+      Opc = AArch64::INSvi32lane;
+      SubregIdx = AArch64::ssub;
+    } else if (EltSize == 64) {
+      Opc = AArch64::INSvi64lane;
+      SubregIdx = AArch64::dsub;
+    } else {
+      llvm_unreachable("invalid elt size!");
+    }
+  }
+  return std::make_pair(Opc, SubregIdx);
+}
+
+MachineInstr *
+AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
+                                    MachineOperand &RHS,
+                                    MachineIRBuilder &MIRBuilder) const {
+  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+  static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri},
+                                       {AArch64::ADDWrr, AArch64::ADDWri}};
+  bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
+  auto ImmFns = selectArithImmed(RHS);
+  unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
+  auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS});
+
+  // If we matched a valid constant immediate, add those operands.
+  if (ImmFns) {
+    for (auto &RenderFn : *ImmFns)
+      RenderFn(AddMI);
+  } else {
+    AddMI.addUse(RHS.getReg());
+  }
+
+  constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI);
+  return &*AddMI;
+}
+
+MachineInstr *
+AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
+                                    MachineIRBuilder &MIRBuilder) const {
+  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+  static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri},
+                                       {AArch64::ADDSWrr, AArch64::ADDSWri}};
+  bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
+  auto ImmFns = selectArithImmed(RHS);
+  unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
+  Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
+
+  auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
+
+  // If we matched a valid constant immediate, add those operands.
+  if (ImmFns) {
+    for (auto &RenderFn : *ImmFns)
+      RenderFn(CmpMI);
+  } else {
+    CmpMI.addUse(RHS.getReg());
+  }
+
+  constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+  return &*CmpMI;
+}
+
+MachineInstr *
+AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
+                                    MachineIRBuilder &MIRBuilder) const {
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+  unsigned RegSize = MRI.getType(LHS).getSizeInBits();
+  bool Is32Bit = (RegSize == 32);
+  static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri},
+                                       {AArch64::ANDSWrr, AArch64::ANDSWri}};
+  Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
+
+  // We might be able to fold in an immediate into the TST. We need to make sure
+  // it's a logical immediate though, since ANDS requires that.
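+  // A "logical immediate" is an AArch64 bitmask immediate: a replicated,
+  // rotated run of contiguous ones. For example (values picked for
+  // illustration, not from this patch) 0xff and 0xfff0 are encodable, so
+  // something like "tst w0, #0xff" can be emitted, while 0 and all-ones are
+  // never encodable and fall through to the register-register ANDS below.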
+  auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
+  bool IsImmForm = ValAndVReg.hasValue() &&
+                   AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize);
+  unsigned Opc = OpcTable[Is32Bit][IsImmForm];
+  auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
+
+  if (IsImmForm)
+    TstMI.addImm(
+        AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize));
+  else
+    TstMI.addUse(RHS);
+
+  constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
+  return &*TstMI;
+}
+
+std::pair<MachineInstr *, CmpInst::Predicate>
+AArch64InstructionSelector::emitIntegerCompare(
+    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
+    MachineIRBuilder &MIRBuilder) const {
+  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+  assert(Predicate.isPredicate() && "Expected predicate?");
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+  CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
+
+  // Fold the compare if possible.
+  MachineInstr *FoldCmp =
+      tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
+  if (FoldCmp)
+    return {FoldCmp, P};
+
+  // Can't fold into a CMN. Just emit a normal compare.
+  unsigned CmpOpc = 0;
+  Register ZReg;
+
+  LLT CmpTy = MRI.getType(LHS.getReg());
+  assert((CmpTy.isScalar() || CmpTy.isPointer()) &&
+         "Expected scalar or pointer");
+  if (CmpTy == LLT::scalar(32)) {
+    CmpOpc = AArch64::SUBSWrr;
+    ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
+    CmpOpc = AArch64::SUBSXrr;
+    ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  } else {
+    return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE};
+  }
+
+  // Try to match immediate forms.
+  MachineInstr *ImmedCmp =
+      tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder);
+  if (ImmedCmp)
+    return {ImmedCmp, P};
+
+  // If we don't have an immediate, we may have a shift which can be folded
+  // into the compare.
+  MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder);
+  if (ShiftedCmp)
+    return {ShiftedCmp, P};
+
+  auto CmpMI =
+      MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()});
+  // Make sure that we can constrain the compare that we emitted.
+  constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+  return {&*CmpMI, P};
+}
+
+MachineInstr *AArch64InstructionSelector::emitVectorConcat(
+    Optional<Register> Dst, Register Op1, Register Op2,
+    MachineIRBuilder &MIRBuilder) const {
+  // We implement a vector concat by:
+  // 1. Use scalar_to_vector to insert the lower vector into the larger dest
+  // 2. Insert the upper vector into the destination's upper element
+  // TODO: some of this code is common with G_BUILD_VECTOR handling.
+  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+  const LLT Op1Ty = MRI.getType(Op1);
+  const LLT Op2Ty = MRI.getType(Op2);
+
+  if (Op1Ty != Op2Ty) {
+    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
+    return nullptr;
+  }
+  assert(Op1Ty.isVector() && "Expected a vector for vector concat");
+
+  if (Op1Ty.getSizeInBits() >= 128) {
+    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
+    return nullptr;
+  }
+
+  // At the moment we just support 64 bit vector concats.
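+  // Sketch of the emitted sequence for two <2 x s32> inputs (invented vregs):
+  //   %w1:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op1, dsub
+  //   %w2:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op2, dsub
+  //   %dst:fpr128 = INSvi64lane %w1, 1, %w2, 0
+  // i.e. both halves are widened to 128 bits and the upper one is moved into
+  // lane 1, yielding the concatenated <4 x s32> value.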
+ if (Op1Ty.getSizeInBits() != 64) { + LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); + return nullptr; + } + + const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); + const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); + const TargetRegisterClass *DstRC = + getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2); + + MachineInstr *WidenedOp1 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); + MachineInstr *WidenedOp2 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); + if (!WidenedOp1 || !WidenedOp2) { + LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); + return nullptr; + } + + // Now do the insert of the upper element. + unsigned InsertOpc, InsSubRegIdx; + std::tie(InsertOpc, InsSubRegIdx) = + getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); + + if (!Dst) + Dst = MRI.createVirtualRegister(DstRC); + auto InsElt = + MIRBuilder + .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) + .addImm(1) /* Lane index */ + .addUse(WidenedOp2->getOperand(0).getReg()) + .addImm(0); + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return &*InsElt; +} + +MachineInstr *AArch64InstructionSelector::emitFMovForFConstant( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && + "Expected a G_FCONSTANT!"); + MachineOperand &ImmOp = I.getOperand(1); + unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + + // Only handle 32 and 64 bit defs for now. + if (DefSize != 32 && DefSize != 64) + return nullptr; + + // Don't handle null values using FMOV. + if (ImmOp.getFPImm()->isNullValue()) + return nullptr; + + // Get the immediate representation for the FMOV. + const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF(); + int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF) + : AArch64_AM::getFP64Imm(ImmValAPF); + + // If this is -1, it means the immediate can't be represented as the requested + // floating point value. Bail. + if (Imm == -1) + return nullptr; + + // Update MI to represent the new FMOV instruction, constrain it, and return. + ImmOp.ChangeToImmediate(Imm); + unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi; + I.setDesc(TII.get(MovOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return &I; +} + +MachineInstr * +AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const { + // CSINC increments the result when the predicate is false. Invert it. + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate((CmpInst::Predicate)Pred)); + auto I = + MIRBuilder + .buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(InvCC); + constrainSelectedInstRegOperands(*I, TII, TRI, RBI); + return &*I; +} + +bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { + MachineIRBuilder MIB(I); + MachineRegisterInfo &MRI = *MIB.getMRI(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + + // We want to recognize this pattern: + // + // $z = G_FCMP pred, $x, $y + // ... + // $w = G_SELECT $z, $a, $b + // + // Where the value of $z is *only* ever used by the G_SELECT (possibly with + // some copies/truncs in between.) 
+ // + // If we see this, then we can emit something like this: + // + // fcmp $x, $y + // fcsel $w, $a, $b, pred + // + // Rather than emitting both of the rather long sequences in the standard + // G_FCMP/G_SELECT select methods. + + // First, check if the condition is defined by a compare. + MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); + while (CondDef) { + // We can only fold if all of the defs have one use. + Register CondDefReg = CondDef->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDefReg)) { + // Unless it's another select. + for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { + if (CondDef == &UI) + continue; + if (UI.getOpcode() != TargetOpcode::G_SELECT) + return false; + } + } + + // We can skip over G_TRUNC since the condition is 1-bit. + // Truncating/extending can have no impact on the value. + unsigned Opc = CondDef->getOpcode(); + if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) + break; + + // Can't see past copies from physregs. + if (Opc == TargetOpcode::COPY && + Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) + return false; + + CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); + } + + // Is the condition defined by a compare? + if (!CondDef) + return false; + + unsigned CondOpc = CondDef->getOpcode(); + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + return false; + + AArch64CC::CondCode CondCode; + if (CondOpc == TargetOpcode::G_ICMP) { + MachineInstr *Cmp; + CmpInst::Predicate Pred; + + std::tie(Cmp, Pred) = + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), + CondDef->getOperand(1), MIB); + + if (!Cmp) { + LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); + return false; + } + + // Have to collect the CondCode after emitIntegerCompare, since it can + // update the predicate. + CondCode = changeICMPPredToAArch64CC(Pred); + } else { + // Get the condition code for the select. + AArch64CC::CondCode CondCode2; + changeFCMPPredToAArch64CC( + (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode, + CondCode2); + + // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two + // instructions to emit the comparison. + // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be + // unnecessary. + if (CondCode2 != AArch64CC::AL) + return false; + + // Make sure we'll be able to select the compare. + unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI); + if (!CmpOpc) + return false; + + // Emit a new compare. + auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()}); + if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + Cmp.addUse(CondDef->getOperand(3).getReg()); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } + + // Emit the select. 
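+  // For example (a sketch; names invented), an s32 select whose condition is
+  // "G_ICMP intpred(sgt), %x, %y" folds to roughly
+  //   SUBSWrr %x, %y        ; cmp  wx, wy
+  //   CSELWr  %t, %f, GT    ; csel wd, wt, wf, gt
+  // instead of materialising the i1 with a CSINC and re-testing it with ANDS.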
+ unsigned CSelOpc = selectSelectOpc(I, MRI, RBI); + auto CSel = + MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()}, + {I.getOperand(2).getReg(), I.getOperand(3).getReg()}) + .addImm(CondCode); + constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && + "Unexpected MachineOperand"); + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + // We want to find this sort of thing: + // x = G_SUB 0, y + // G_ICMP z, x + // + // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. + // e.g: + // + // cmn z, y + + // Helper lambda to detect the subtract followed by the compare. + // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0. + auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) { + if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Need to make sure NZCV is the same at the end of the transformation. + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return false; + + // We want to match against SUBs. + if (DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Make sure that we're getting + // x = G_SUB 0, y + auto ValAndVReg = + getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return false; + + // This can safely be represented as a CMN. + return true; + }; + + // Check if the RHS or LHS of the G_ICMP is defined by a SUB + MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); + MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); + CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P); + + // Given this: + // + // x = G_SUB 0, y + // G_ICMP x, z + // + // Produce this: + // + // cmn y, z + if (IsCMN(LHSDef, CC)) + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + + // Same idea here, but with the RHS of the compare instead: + // + // Given this: + // + // x = G_SUB 0, y + // G_ICMP z, x + // + // Produce this: + // + // cmn z, y + if (IsCMN(RHSDef, CC)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + + // Given this: + // + // z = G_AND x, y + // G_ICMP z, 0 + // + // Produce this if the compare is signed: + // + // tst x, y + if (!isUnsignedICMPPred(P) && LHSDef && + LHSDef->getOpcode() == TargetOpcode::G_AND) { + // Make sure that the RHS is 0. + auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return nullptr; + + return emitTST(LHSDef->getOperand(1).getReg(), + LHSDef->getOperand(2).getReg(), MIRBuilder); + } + + return nullptr; +} + +MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P, + MachineIRBuilder &MIB) const { + // Attempt to select the immediate form of an integer compare. + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "Expected 32 bit or 64 bit compare only?"); + + // Check if this is a case we can already handle. 
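+  // Worked example of the adjust-by-one trick below (constants chosen for
+  // illustration): "x ult 0x1001" has no 12-bit encoding (optionally shifted
+  // left by 12), but "x ule 0x1000" does, since 0x1000 == 1 << 12. So the
+  // constant is decremented and the predicate rewritten from ult to ule.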
+ InstructionSelector::ComplexRendererFns ImmFns; + ImmFns = selectArithImmed(RHS); + + if (!ImmFns) { + // We didn't get a rendering function, but we may still have a constant. + auto MaybeImmed = getImmedFromMO(RHS); + if (!MaybeImmed) + return nullptr; + + // We have a constant, but it doesn't fit. Try adjusting it by one and + // updating the predicate if possible. + uint64_t C = *MaybeImmed; + CmpInst::Predicate NewP; + switch (P) { + default: + return nullptr; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + // Check for + // + // x slt c => x sle c - 1 + // x sge c => x sgt c - 1 + // + // When c is not the smallest possible negative number. + if ((Size == 64 && static_cast(C) == INT64_MIN) || + (Size == 32 && static_cast(C) == INT32_MIN)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT; + C -= 1; + break; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_UGE: + // Check for + // + // x ult c => x ule c - 1 + // x uge c => x ugt c - 1 + // + // When c is not zero. + if (C == 0) + return nullptr; + NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; + C -= 1; + break; + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_SGT: + // Check for + // + // x sle c => x slt c + 1 + // x sgt c => s sge c + 1 + // + // When c is not the largest possible signed integer. + if ((Size == 32 && static_cast(C) == INT32_MAX) || + (Size == 64 && static_cast(C) == INT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE; + C += 1; + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_UGT: + // Check for + // + // x ule c => x ult c + 1 + // x ugt c => s uge c + 1 + // + // When c is not the largest possible unsigned integer. + if ((Size == 32 && static_cast(C) == UINT32_MAX) || + (Size == 64 && C == UINT64_MAX)) + return nullptr; + NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE; + C += 1; + break; + } + + // Check if the new constant is valid. + if (Size == 32) + C = static_cast(C); + ImmFns = select12BitValueWithLeftShift(C); + if (!ImmFns) + return nullptr; + P = NewP; + } + + // At this point, we know we can select an immediate form. Go ahead and do + // that. + Register ZReg; + unsigned Opc; + if (Size == 32) { + ZReg = AArch64::WZR; + Opc = AArch64::SUBSWri; + } else { + ZReg = AArch64::XZR; + Opc = AArch64::SUBSXri; + } + + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const { + // We are looking for the following pattern: + // + // shift = G_SHL/ASHR/LHSR y, c + // ... + // cmp = G_ICMP pred, something, shift + // + // Since we will select the G_ICMP to a SUBS, we can potentially fold the + // shift into the subtract. 
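+  // For example (sketch, invented vregs):
+  //   %amt:gpr(s64) = G_CONSTANT i64 3
+  //   %s:gpr(s64)   = G_SHL %y, %amt
+  //   %c:gpr(s32)   = G_ICMP intpred(eq), %x, %s
+  // can be selected as SUBSXrs against xzr, printed as "cmp x0, x1, lsl #3".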
+ static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs}; + static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR}; + auto ImmFns = selectShiftedRegister(RHS); + if (!ImmFns) + return nullptr; + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected scalar or pointer only?"); + unsigned Size = Ty.getSizeInBits(); + bool Idx = (Size == 64); + Register ZReg = ZRegTable[Idx]; + unsigned Opc = OpcTable[Idx]; + auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +bool AArch64InstructionSelector::selectShuffleVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + Register Src1Reg = I.getOperand(1).getReg(); + const LLT Src1Ty = MRI.getType(Src1Reg); + Register Src2Reg = I.getOperand(2).getReg(); + const LLT Src2Ty = MRI.getType(Src2Reg); + ArrayRef Mask = I.getOperand(3).getShuffleMask(); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + LLVMContext &Ctx = MF.getFunction().getContext(); + + // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if + // it's originated from a <1 x T> type. Those should have been lowered into + // G_BUILD_VECTOR earlier. + if (!Src1Ty.isVector() || !Src2Ty.isVector()) { + LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); + return false; + } + + unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; + + SmallVector CstIdxs; + for (int Val : Mask) { + // For now, any undef indexes we'll just assume to be 0. This should be + // optimized in future, e.g. to select DUP etc. + Val = Val < 0 ? 0 : Val; + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); + } + } + + MachineIRBuilder MIRBuilder(I); + + // Use a constant pool to load the index vector for TBL. + Constant *CPVal = ConstantVector::get(CstIdxs); + MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder); + if (!IndexLoad) { + LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); + return false; + } + + if (DstTy.getSizeInBits() != 128) { + assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); + // This case can be done with TBL1. + MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder); + if (!Concat) { + LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); + return false; + } + + // The constant pool load will be 64 bits, so need to convert to FPR128 reg. + IndexLoad = + emitScalarToVector(64, &AArch64::FPR128RegClass, + IndexLoad->getOperand(0).getReg(), MIRBuilder); + + auto TBL1 = MIRBuilder.buildInstr( + AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, + {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); + constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); + + auto Copy = + MIRBuilder + .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(TBL1.getReg(0), 0, AArch64::dsub); + RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); + I.eraseFromParent(); + return true; + } + + // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive + // Q registers for regalloc. 
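+  // Worked example of the index vector built above (mask chosen for
+  // illustration): shuffling two <4 x s32> sources with mask <0, 4, 1, 5>
+  // gives BytesPerElt == 4 and byte indices
+  //   { 0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23 }
+  // where indices 16..31 pick bytes from the second source once the two Q
+  // registers are tied together by the REG_SEQUENCE below.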
+ auto RegSeq = MIRBuilder + .buildInstr(TargetOpcode::REG_SEQUENCE, + {&AArch64::QQRegClass}, {Src1Reg}) + .addImm(AArch64::qsub0) + .addUse(Src2Reg) + .addImm(AArch64::qsub1); + + auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, + {RegSeq, IndexLoad->getOperand(0)}); + constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); + constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::emitLaneInsert( + Optional DstReg, Register SrcReg, Register EltReg, + unsigned LaneIdx, const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const { + MachineInstr *InsElt = nullptr; + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + + // Create a register to define with the insert if one wasn't passed in. + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + + unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); + unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; + + if (RB.getID() == AArch64::FPRRegBankID) { + auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(InsSub->getOperand(0).getReg()) + .addImm(0); + } else { + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(EltReg); + } + + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return InsElt; +} + +bool AArch64InstructionSelector::selectInsertElt( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); + + // Get information on the destination. + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + unsigned VecSize = DstTy.getSizeInBits(); + + // Get information on the element we want to insert into the destination. + Register EltReg = I.getOperand(2).getReg(); + const LLT EltTy = MRI.getType(EltReg); + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + + // Find the definition of the index. Bail out if it's not defined by a + // G_CONSTANT. + Register IdxReg = I.getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + // Perform the lane insert. + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + if (VecSize < 128) { + // If the vector we're inserting into is smaller than 128 bits, widen it + // to 128 to do the insert. + MachineInstr *ScalarToVec = emitScalarToVector( + VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder); + if (!ScalarToVec) + return false; + SrcReg = ScalarToVec->getOperand(0).getReg(); + } + + // Create an insert into a new FPR128 register. + // Note that if our vector is already 128 bits, we end up emitting an extra + // register. + MachineInstr *InsMI = + emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder); + + if (VecSize < 128) { + // If we had to widen to perform the insert, then we have to demote back to + // the original size to get the result we want. 
+ Register DemoteVec = InsMI->getOperand(0).getReg(); + const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize + << "\n"); + return false; + } + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DemoteVec, 0, SubReg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // No widening needed. + InsMI->getOperand(0).setReg(DstReg); + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptConstantBuildVec( + MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!"); + if (DstTy.getSizeInBits() < 32) + return false; + // Check if we're building a constant vector, in which case we want to + // generate a constant pool load instead of a vector insert sequence. + SmallVector Csts; + for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { + // Try to find G_CONSTANT or G_FCONSTANT + auto *OpMI = + getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); + if (OpMI) + Csts.emplace_back( + const_cast(OpMI->getOperand(1).getCImm())); + else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, + I.getOperand(Idx).getReg(), MRI))) + Csts.emplace_back( + const_cast(OpMI->getOperand(1).getFPImm())); + else + return false; + } + Constant *CV = ConstantVector::get(Csts); + MachineIRBuilder MIB(I); + auto *CPLoad = emitLoadFromConstantPool(CV, MIB); + if (!CPLoad) { + LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); + return false; + } + MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + *MRI.getRegClass(CPLoad->getOperand(0).getReg()), + MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectBuildVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + // Until we port more of the optimized selections, for now just use a vector + // insert sequence. + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); + unsigned EltSize = EltTy.getSizeInBits(); + + if (tryOptConstantBuildVec(I, DstTy, MRI)) + return true; + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineInstr *ScalarToVec = + emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, + I.getOperand(1).getReg(), MIRBuilder); + if (!ScalarToVec) + return false; + + Register DstVec = ScalarToVec->getOperand(0).getReg(); + unsigned DstSize = DstTy.getSizeInBits(); + + // Keep track of the last MI we inserted. Later on, we might be able to save + // a copy using it. 
+ MachineInstr *PrevMI = nullptr; + for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { + // Note that if we don't do a subregister copy, we can end up making an + // extra register. + PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, + MIRBuilder); + DstVec = PrevMI->getOperand(0).getReg(); + } + + // If DstTy's size in bits is less than 128, then emit a subregister copy + // from DstVec to the last register we've defined. + if (DstSize < 128) { + // Force this to be FPR using the destination vector. + const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); + if (!RC) + return false; + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize + << "\n"); + return false; + } + + Register Reg = MRI.createVirtualRegister(RC); + Register DstReg = I.getOperand(0).getReg(); + + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DstVec, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(Reg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // We don't need a subregister copy. Save a copy by re-using the + // destination register on the final insert. + assert(PrevMI && "PrevMI was null?"); + PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); + constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +/// Helper function to find an intrinsic ID on an a MachineInstr. Returns the +/// ID if it exists, and 0 otherwise. +static unsigned findIntrinsicID(MachineInstr &I) { + auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) { + return Op.isIntrinsicID(); + }); + if (IntrinOp == I.operands_end()) + return 0; + return IntrinOp->getIntrinsicID(); +} + +bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // Find the intrinsic ID. + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + // Select the instruction. + switch (IntrinID) { + default: + return false; + case Intrinsic::trap: + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1); + break; + case Intrinsic::debugtrap: + if (!STI.isTargetWindows()) + return false; + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); + break; + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, + MachineRegisterInfo &MRI) { + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + switch (IntrinID) { + default: + break; + case Intrinsic::aarch64_crypto_sha1h: { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(2).getReg(); + + // FIXME: Should this be an assert? + if (MRI.getType(DstReg).getSizeInBits() != 32 || + MRI.getType(SrcReg).getSizeInBits() != 32) + return false; + + // The operation has to happen on FPRs. Set up some new FPR registers for + // the source and destination if they are on GPRs. 
+ if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)}); + + // Make sure the copy ends up getting constrained properly. + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + } + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) + DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + + // Actually insert the instruction. + auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); + constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); + + // Did we create a new register for the destination? + if (DstReg != I.getOperand(0).getReg()) { + // Yep. Copy the result of the instruction back into the original + // destination. + MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg}); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + } + + I.eraseFromParent(); + return true; + } + case Intrinsic::frameaddress: + case Intrinsic::returnaddress: { + MachineFunction &MF = *I.getParent()->getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Depth = I.getOperand(2).getImm(); + Register DstReg = I.getOperand(0).getReg(); + RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); + + if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { + if (MFReturnAddr) { + MIRBuilder.buildCopy({DstReg}, MFReturnAddr); + I.eraseFromParent(); + return true; + } + MFI.setReturnAddressIsTaken(true); + MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass); + // Insert the copy from LR/X30 into the entry block, before it can be + // clobbered by anything. + MachineBasicBlock &EntryBlock = *MF.begin(); + if (!EntryBlock.isLiveIn(AArch64::LR)) + EntryBlock.addLiveIn(AArch64::LR); + MachineIRBuilder EntryBuilder(MF); + EntryBuilder.setInstr(*EntryBlock.begin()); + EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + MFReturnAddr = DstReg; + I.eraseFromParent(); + return true; + } + + MFI.setFrameAddressIsTaken(true); + Register FrameAddr(AArch64::FP); + while (Depth--) { + Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + auto Ldr = + MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}) + .addImm(0); + constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); + FrameAddr = NextFrame; + } + + if (IntrinID == Intrinsic::frameaddress) + MIRBuilder.buildCopy({DstReg}, {FrameAddr}); + else { + MFI.setReturnAddressIsTaken(true); + MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1); + } + + I.eraseFromParent(); + return true; + } + } + return false; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = (32 - *MaybeImmed) & 0x1f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = 31 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if 
(MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = (64 - *MaybeImmed) & 0x3f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = 63 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +/// Helper to select an immediate value that can be represented as a 12-bit +/// value shifted left by either 0 or 12. If it is possible to do so, return +/// the immediate and shift value. If not, return None. +/// +/// Used by selectArithImmed and selectNegArithImmed. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::select12BitValueWithLeftShift( + uint64_t Immed) const { + unsigned ShiftAmt; + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return None; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + return select12BitValueWithLeftShift(*MaybeImmed); +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { + // We need a register here, because we need to know if we have a 64 or 32 + // bit immediate. + if (!Root.isReg()) + return None; + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + uint64_t Immed = *MaybeImmed; + + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) + return None; + + // Check if we're dealing with a 32-bit type on the root or a 64-bit type on + // the root. + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + if (MRI.getType(Root.getReg()).getSizeInBits() == 32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + + if (Immed & 0xFFFFFFFFFF000000ULL) + return None; + + Immed &= 0xFFFFFFULL; + return select12BitValueWithLeftShift(Immed); +} + +/// Return true if it is worth folding MI into an extended register. That is, +/// if it's safe to pull it into the addressing mode of a load or store as a +/// shift. +bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + // Always fold if there is one use, or if we're optimizing for size. 
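+  // With multiple uses, folding duplicates the shift into every user's
+  // addressing mode, so below we only do it on subtargets with LSLFast and
+  // only when all of the users are memory operations.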
+ Register DefReg = MI.getOperand(0).getReg(); + if (MRI.hasOneNonDBGUse(DefReg) || + MI.getParent()->getParent()->getFunction().hasMinSize()) + return true; + + // It's better to avoid folding and recomputing shifts when we don't have a + // fastpath. + if (!STI.hasLSLFast()) + return false; + + // We have a fastpath, so folding a shift in and potentially computing it + // many times may be beneficial. Check if this is only used in memory ops. + // If it is, then we should fold. + return all_of(MRI.use_nodbg_instructions(DefReg), + [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); +} + +static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { + switch (Type) { + case AArch64_AM::SXTB: + case AArch64_AM::SXTH: + case AArch64_AM::SXTW: + return true; + default: + return false; + } +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectExtendedSHL( + MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, + unsigned SizeInBytes, bool WantsExt) const { + assert(Base.isReg() && "Expected base to be a register operand"); + assert(Offset.isReg() && "Expected offset to be a register operand"); + + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); + if (!OffsetInst) + return None; + + unsigned OffsetOpc = OffsetInst->getOpcode(); + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Now, try to find the specific G_CONSTANT. Start by assuming that the + // register we will offset is the LHS, and the register containing the + // constant is the RHS. + Register OffsetReg = OffsetInst->getOperand(1).getReg(); + Register ConstantReg = OffsetInst->getOperand(2).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) { + // We didn't get a constant on the RHS. If the opcode is a shift, then + // we're done. + if (OffsetOpc == TargetOpcode::G_SHL) + return None; + + // If we have a G_MUL, we can use either register. Try looking at the RHS. + std::swap(OffsetReg, ConstantReg); + ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) + return None; + } + + // The value must fit into 3 bits, and must be positive. Make sure that is + // true. + int64_t ImmVal = ValAndVReg->Value; + + // Since we're going to pull this into a shift, the constant value must be + // a power of 2. If we got a multiply, then we need to check this. + if (OffsetOpc == TargetOpcode::G_MUL) { + if (!isPowerOf2_32(ImmVal)) + return None; + + // Got a power of 2. So, the amount we'll shift is the log base-2 of that. + ImmVal = Log2_32(ImmVal); + } + + if ((ImmVal & 0x7) != ImmVal) + return None; + + // We are only allowed to shift by LegalShiftVal. This shift value is built + // into the instruction, so we can't just use whatever we want. + if (ImmVal != LegalShiftVal) + return None; + + unsigned SignExtend = 0; + if (WantsExt) { + // Check if the offset is defined by an extend. + MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); + auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; + // We only support SXTW for signed extension here. 
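+    // (The register-offset addressing modes only encode UXTW, LSL, SXTW and
+    // SXTX, so a folded SXTB/SXTH would not be representable.)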
+ if (SignExtend && Ext != AArch64_AM::SXTW) + return None; + + // Need a 32-bit wide register here. + MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); + OffsetReg = ExtInst->getOperand(1).getReg(); + OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); + } + + // We can use the LHS of the GEP as the base, and the LHS of the shift as an + // offset. Signify that we are shifting by setting the shift flag to 1. + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. + MIB.addImm(SignExtend); + MIB.addImm(1); + }}}; +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_PTR_ADD. + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. + MachineInstr *OffsetInst = + getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); + return selectExtendedSHL(Root, PtrAdd->getOperand(1), + OffsetInst->getOperand(0), SizeInBytes, + /*WantsExt=*/false); +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(2).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. 
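+             // Here (0, 0) roughly means "no extend, no shift", i.e. a plain
+             // register-register address such as ldr x0, [x1, x2].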
+ MIB.addImm(0); + MIB.addImm(0); + }}}; +} + +/// This is intended to be equivalent to selectAddrModeXRO in +/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // Try to fold shifts into the addressing mode. + auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); + if (AddrModeFns) + return AddrModeFns; + + // If that doesn't work, see if it's possible to fold in registers from + // a GEP. + return selectAddrModeRegisterOffset(Root); +} + +/// This is used for computing addresses like this: +/// +/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] +/// +/// Where we have a 64-bit base register, a 32-bit offset register, and an +/// extend (which may or may not be signed). +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + MachineOperand &LHS = PtrAdd->getOperand(1); + MachineOperand &RHS = PtrAdd->getOperand(2); + MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); + + // The first case is the same as selectAddrModeXRO, except we need an extend. + // In this case, we try to find a shift and extend, and fold them into the + // addressing mode. + // + // E.g. + // + // off_reg = G_Z/S/ANYEXT ext_reg + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // In this case we can get a load like this: + // + // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] + auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), + SizeInBytes, /*WantsExt=*/true); + if (ExtendedShl) + return ExtendedShl; + + // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. + // + // e.g. + // ldr something, [base_reg, ext_reg, sxtw] + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Check if this is an extend. We'll get an extend type if it is. + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + // Need a 32-bit wide register. + MachineIRBuilder MIB(*PtrAdd); + Register ExtReg = + narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB); + unsigned SignExtend = Ext == AArch64_AM::SXTW; + + // Base is LHS, offset is ExtReg. + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(SignExtend); + MIB.addImm(0); + }}}; +} + +/// Select a "register plus unscaled signed 9-bit immediate" address. This +/// should only match when there is an offset that is not valid for a scaled +/// immediate addressing mode. The "Size" argument is the size in bytes of the +/// memory reference, which is needed here to know what is valid for a scaled +/// immediate. 
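+/// E.g. a 4-byte access at offset -3 can only use the unscaled form
+/// (ldur w0, [x1, #-3]); the scaled form requires a non-negative offset that
+/// is a multiple of the access size.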
+InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + if (!Root.isReg()) + return None; + + if (!isBaseWithConstantOffset(Root, MRI)) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + MachineOperand &OffImm = RootDef->getOperand(2); + if (!OffImm.isReg()) + return None; + MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); + if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + int64_t RHSC; + MachineOperand &RHSOp1 = RHS->getOperand(1); + if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) + return None; + RHSC = RHSOp1.getCImm()->getSExtValue(); + + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) + return None; + if (RHSC >= -256 && RHSC < 256) { + MachineOperand &Base = RootDef->getOperand(1); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, + }}; + } + return None; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, + unsigned Size, + MachineRegisterInfo &MRI) const { + if (RootDef.getOpcode() != AArch64::G_ADD_LOW) + return None; + MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); + if (Adrp.getOpcode() != AArch64::ADRP) + return None; + + // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. + // TODO: Need to check GV's offset % size if doing offset folding into globals. + assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global"); + auto GV = Adrp.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return None; + + auto &MF = *RootDef.getParent()->getParent(); + if (GV->getPointerAlignment(MF.getDataLayout()) < Size) + return None; + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); + MachineIRBuilder MIRBuilder(RootDef); + Register AdrpReg = Adrp.getOperand(0).getReg(); + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addGlobalAddress(GV, /* Offset */ 0, + OpFlags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + }}}; +} + +/// Select a "register plus scaled unsigned 12-bit immediate" address. The +/// "Size" argument is the size in bytes of the memory reference, which +/// determines the scale. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const { + MachineFunction &MF = *Root.getParent()->getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (!Root.isReg()) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; + } + + CodeModel::Model CM = MF.getTarget().getCodeModel(); + // Check if we can fold in the ADD of small code model ADRP + ADD address. 
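+  // I.e. prefer adrp x8, sym + ldr w0, [x8, :lo12:sym] over materializing the
+  // full address with a separate add first (illustrative registers).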
+ if (CM == CodeModel::Small) { + auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); + if (OpFns) + return OpFns; + } + + if (isBaseWithConstantOffset(Root, MRI)) { + MachineOperand &LHS = RootDef->getOperand(1); + MachineOperand &RHS = RootDef->getOperand(2); + MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); + MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (selectAddrModeUnscaled(Root, Size).hasValue()) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; +} + +/// Given a shift instruction, return the correct shift type for that +/// instruction. +static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { + // TODO: Handle AArch64_AM::ROR + switch (MI.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case TargetOpcode::G_SHL: + return AArch64_AM::LSL; + case TargetOpcode::G_LSHR: + return AArch64_AM::LSR; + case TargetOpcode::G_ASHR: + return AArch64_AM::ASR; + } +} + +/// Select a "shifted register" operand. If the value is not shifted, set the +/// shift operand to a default value of "lsl 0". +/// +/// TODO: Allow shifted register to be rotated in logical instructions. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + // Check if the operand is defined by an instruction which corresponds to + // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. + // + // TODO: Handle AArch64_AM::ROR for logical instructions. + MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); + if (!ShiftInst) + return None; + AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); + if (ShType == AArch64_AM::InvalidShiftExtend) + return None; + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + return None; + + // Need an immediate on the RHS. + MachineOperand &ShiftRHS = ShiftInst->getOperand(2); + auto Immed = getImmedFromMO(ShiftRHS); + if (!Immed) + return None; + + // We have something that we can fold. Fold in the shift's LHS and RHS into + // the instruction. 
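+  // E.g. folding %s = G_SHL %x, 3 into its user yields the operand pair
+  // (%x, lsl #3), selecting to forms like add x0, x1, x2, lsl #3.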
+ MachineOperand &ShiftLHS = ShiftInst->getOperand(1); + Register ShiftReg = ShiftLHS.getReg(); + + unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); + unsigned Val = *Immed & (NumBits - 1); + unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; +} + +AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( + MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { + unsigned Opc = MI.getOpcode(); + + // Handle explicit extend instructions first. + if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { + unsigned Size; + if (Opc == TargetOpcode::G_SEXT) + Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + else + Size = MI.getOperand(2).getImm(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::SXTB; + case 16: + return AArch64_AM::SXTH; + case 32: + return AArch64_AM::SXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::UXTB; + case 16: + return AArch64_AM::UXTH; + case 32: + return AArch64_AM::UXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + // Don't have an explicit extend. Try to handle a G_AND with a constant mask + // on the RHS. + if (Opc != TargetOpcode::G_AND) + return AArch64_AM::InvalidShiftExtend; + + Optional MaybeAndMask = getImmedFromMO(MI.getOperand(2)); + if (!MaybeAndMask) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = *MaybeAndMask; + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } +} + +Register AArch64InstructionSelector::narrowExtendRegIfNeeded( + Register ExtReg, MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + if (MRI.getType(ExtReg).getSizeInBits() == 32) + return ExtReg; + + // Insert a copy to move ExtReg to GPR32. + Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + + // Select the copy into a subregister copy. + selectCopy(*Copy, TII, MRI, TRI, RBI); + return Copy.getReg(0); +} + +Register AArch64InstructionSelector::widenGPRBankRegIfNeeded( + Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const { + assert(WideSize >= 8 && "WideSize is smaller than all possible registers?"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NarrowSize = MRI.getType(Reg).getSizeInBits(); + assert(WideSize >= NarrowSize && + "WideSize cannot be smaller than NarrowSize!"); + + // If the sizes match, just return the register. + // + // If NarrowSize is an s1, then we can select it to any size, so we'll treat + // it as a don't care. + if (NarrowSize == WideSize || NarrowSize == 1) + return Reg; + + // Now check the register classes. 
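+  // (If the classes differ, e.g. widening a GPR32 value to 64 bits, a
+  // SUBREG_TO_REG such as %wide:gpr64 = SUBREG_TO_REG 0, %narrow:gpr32, sub_32
+  // is inserted below.)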
+ const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); + const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize); + const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize); + assert(OrigRC && "Could not determine narrow RC?"); + assert(WideRC && "Could not determine wide RC?"); + + // If the sizes differ, but the register classes are the same, there is no + // need to insert a SUBREG_TO_REG. + // + // For example, an s8 that's supposed to be a GPR will be selected to either + // a GPR32 or a GPR64 register. Note that this assumes that the s8 will + // always end up on a GPR32. + if (OrigRC == WideRC) + return Reg; + + // We have two different register classes. Insert a SUBREG_TO_REG. + unsigned SubReg = 0; + getSubRegForClass(OrigRC, TRI, SubReg); + assert(SubReg && "Couldn't determine subregister?"); + + // Build the SUBREG_TO_REG and return the new, widened register. + auto SubRegToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {}) + .addImm(0) + .addUse(Reg) + .addImm(SubReg); + constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI); + return SubRegToReg.getReg(0); +} + +/// Select an "extended register" operand. This operand folds in an extend +/// followed by an optional left shift. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithExtendedRegister( + MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + uint64_t ShiftVal = 0; + Register ExtReg; + AArch64_AM::ShiftExtendType Ext; + MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); + if (!RootDef) + return None; + + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + return None; + + // Check if we can fold a shift and an extend. + if (RootDef->getOpcode() == TargetOpcode::G_SHL) { + // Look for a constant on the RHS of the shift. + MachineOperand &RHS = RootDef->getOperand(2); + Optional MaybeShiftVal = getImmedFromMO(RHS); + if (!MaybeShiftVal) + return None; + ShiftVal = *MaybeShiftVal; + if (ShiftVal > 4) + return None; + // Look for a valid extend instruction on the LHS of the shift. + MachineOperand &LHS = RootDef->getOperand(1); + MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); + if (!ExtDef) + return None; + Ext = getExtendTypeForInst(*ExtDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = ExtDef->getOperand(1).getReg(); + } else { + // Didn't get a shift. Try just folding an extend. + Ext = getExtendTypeForInst(*RootDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = RootDef->getOperand(1).getReg(); + + // If we have a 32 bit instruction which zeroes out the high half of a + // register, we get an implicit zero extend for free. Check if we have one. + // FIXME: We actually emit the extend right now even though we don't have + // to. + if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { + MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); + if (ExtInst && isDef32(*ExtInst)) + return None; + } + } + + // We require a GPR32 here. Narrow the ExtReg if needed using a subregister + // copy. 
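+  // The end result is an arithmetic extended-register operand, e.g. forms
+  // like add x0, x1, w2, sxth #2 (illustrative registers).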
+ MachineIRBuilder MIB(*RootDef); + ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(getArithExtendImm(Ext, ShiftVal)); + }}}; +} + +void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + Optional CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); + assert(CstVal && "Expected constant value"); + MIB.addImm(CstVal.getValue()); +} + +void AArch64InstructionSelector::renderLogicalImm32( + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); + MIB.addImm(Enc); +} + +void AArch64InstructionSelector::renderLogicalImm64( + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); + MIB.addImm(Enc); +} + +bool AArch64InstructionSelector::isLoadStoreOfNumBytes( + const MachineInstr &MI, unsigned NumBytes) const { + if (!MI.mayLoadOrStore()) + return false; + assert(MI.hasOneMemOperand() && + "Expected load/store to have only one mem op!"); + return (*MI.memoperands_begin())->getSize() == NumBytes; +} + +bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) + return false; + + // Only return true if we know the operation will zero-out the high half of + // the 64-bit register. Truncates can be subregister copies, which don't + // zero out the high bits. Copies and other copy-like instructions can be + // fed by truncates, or could be lowered as subregister copies. + switch (MI.getOpcode()) { + default: + return true; + case TargetOpcode::COPY: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_PHI: + return false; + } +} + + +// Perform fixups on the given PHI instruction's operands to force them all +// to be the same as the destination regbank. +static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, + const AArch64RegisterBankInfo &RBI) { + assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI"); + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg); + assert(DstRB && "Expected PHI dst to have regbank assigned"); + MachineIRBuilder MIB(MI); + + // Go through each operand and ensure it has the same regbank. + for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) + continue; + Register OpReg = MO.getReg(); + const RegisterBank *RB = MRI.getRegBankOrNull(OpReg); + if (RB != DstRB) { + // Insert a cross-bank copy. 
+ auto *OpDef = MRI.getVRegDef(OpReg); + const LLT &Ty = MRI.getType(OpReg); + MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator())); + auto Copy = MIB.buildCopy(Ty, OpReg); + MRI.setRegBank(Copy.getReg(0), *DstRB); + MO.setReg(Copy.getReg(0)); + } + } +} + +void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { + // We're looking for PHIs, build a list so we don't invalidate iterators. + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector Phis; + for (auto &BB : MF) { + for (auto &MI : BB) { + if (MI.getOpcode() == TargetOpcode::G_PHI) + Phis.emplace_back(&MI); + } + } + + for (auto *MI : Phis) { + // We need to do some work here if the operand types are < 16 bit and they + // are split across fpr/gpr banks. Since all types <32b on gpr + // end up being assigned gpr32 regclasses, we can end up with PHIs here + // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't + // be selecting heterogenous regbanks for operands if possible, but we + // still need to be able to deal with it here. + // + // To fix this, if we have a gpr-bank operand < 32b in size and at least + // one other operand is on the fpr bank, then we add cross-bank copies + // to homogenize the operand banks. For simplicity the bank that we choose + // to settle on is whatever bank the def operand has. For example: + // + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2 + // => + // %bb2: + // ... + // %in2_copy:gpr(s16) = COPY %in2:fpr(s16) + // ... + // %endbb: + // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 + bool HasGPROp = false, HasFPROp = false; + for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) { + const auto &MO = MI->getOperand(OpIdx); + if (!MO.isReg()) + continue; + const LLT &Ty = MRI.getType(MO.getReg()); + if (!Ty.isValid() || !Ty.isScalar()) + break; + if (Ty.getSizeInBits() >= 32) + break; + const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); + // If for some reason we don't have a regbank yet. Don't try anything. + if (!RB) + break; + + if (RB->getID() == AArch64::GPRRegBankID) + HasGPROp = true; + else + HasFPROp = true; + } + // We have heterogenous regbanks, need to fixup. + if (HasGPROp && HasFPROp) + fixupPHIOpBanks(*MI, MRI, RBI); + } +} + +namespace llvm { +InstructionSelector * +createAArch64InstructionSelector(const AArch64TargetMachine &TM, + AArch64Subtarget &Subtarget, + AArch64RegisterBankInfo &RBI) { + return new AArch64InstructionSelector(TM, Subtarget, RBI); +} +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp new file mode 100644 index 0000000000000..2eaec0b970fa6 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -0,0 +1,809 @@ +//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for +/// AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AArch64LegalizerInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" + +#define DEBUG_TYPE "aarch64-legalinfo" + +using namespace llvm; +using namespace LegalizeActions; +using namespace LegalizeMutations; +using namespace LegalityPredicates; + +AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) + : ST(&ST) { + using namespace TargetOpcode; + const LLT p0 = LLT::pointer(0, 64); + const LLT s1 = LLT::scalar(1); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); + const LLT s256 = LLT::scalar(256); + const LLT s512 = LLT::scalar(512); + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s8 = LLT::vector(8, 8); + const LLT v4s8 = LLT::vector(4, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s16 = LLT::vector(4, 16); + const LLT v2s16 = LLT::vector(2, 16); + const LLT v2s32 = LLT::vector(2, 32); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + const LLT v2p0 = LLT::vector(2, p0); + + const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); + + // FIXME: support subtargets which have neon/fp-armv8 disabled. + if (!ST.hasNEON() || !ST.hasFPARMv8()) { + computeTables(); + return; + } + + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + .clampScalar(0, s1, s64) + .widenScalarToNextPow2(0, 8) + .fewerElementsIf( + [=](const LegalityQuery &Query) { + return Query.Types[0].isVector() && + (Query.Types[0].getElementType() != s64 || + Query.Types[0].getNumElements() != 2); + }, + [=](const LegalityQuery &Query) { + LLT EltTy = Query.Types[0].getElementType(); + if (EltTy == s64) + return std::make_pair(0, LLT::vector(2, 64)); + return std::make_pair(0, EltTy); + }); + + getActionDefinitionsBuilder(G_PHI) + .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) + .clampScalar(0, s16, s64) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({s32, s64, v4s32, v2s32, v2s64}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0); + + getActionDefinitionsBuilder(G_SHL) + .legalFor({{s32, s32}, {s64, s64}, + {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder(G_PTR_ADD) + .legalFor({{p0, s64}, {v2p0, v2s64}}) + .clampScalar(1, s64, s64); + + getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); + + getActionDefinitionsBuilder({G_SDIV, G_UDIV}) + .legalFor({s32, s64}) + .libcallFor({s128}) + .clampScalar(0, s32, s64) + 
.widenScalarToNextPow2(0) + .scalarize(0); + + getActionDefinitionsBuilder({G_LSHR, G_ASHR}) + .customIf([=](const LegalityQuery &Query) { + const auto &SrcTy = Query.Types[0]; + const auto &AmtTy = Query.Types[1]; + return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + AmtTy.getSizeInBits() == 32; + }) + .legalFor({{s32, s32}, + {s32, s64}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder({G_SREM, G_UREM}) + .lowerFor({s1, s8, s16, s32, s64}); + + getActionDefinitionsBuilder({G_SMULO, G_UMULO}) + .lowerFor({{s64, s1}}); + + getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); + + getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO}) + .legalFor({{s32, s1}, {s64, s1}}) + .minScalar(0, s32); + + getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) + .legalFor({s32, s64, v2s64, v4s32, v2s32}); + + getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); + + getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, + G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, + G_FNEARBYINT}) + // If we don't have full FP16 support, then scalarize the elements of + // vectors containing fp16 types. + .fewerElementsIf( + [=, &ST](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + return Ty.isVector() && Ty.getElementType() == s16 && + !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) + // If we don't have full FP16 support, then widen s16 to s32 if we + // encounter it. + .widenScalarIf( + [=, &ST](const LegalityQuery &Query) { + return Query.Types[0] == s16 && !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) + .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); + + getActionDefinitionsBuilder( + {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW}) + // We need a call for these, so we always need to scalarize. + .scalarize(0) + // Regardless of FP16 support, widen 16-bit elements to 32-bits. 
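+      // (The runtime only provides float/double entry points, e.g. sinf/sin,
+      // so there is no half-precision libcall to lower an s16 operation to.)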
+ .minScalar(0, s32) + .libcallFor({s32, s64, v2s32, v4s32, v2s64}); + + getActionDefinitionsBuilder(G_INSERT) + .unsupportedIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits(); + }) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0) + return false; + return isPowerOf2_32(Ty1.getSizeInBits()) && + (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8); + }) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .maxScalarIf(typeInSet(0, {s32}), 1, s16) + .maxScalarIf(typeInSet(0, {s64}), 1, s32) + .widenScalarToNextPow2(1); + + getActionDefinitionsBuilder(G_EXTRACT) + .unsupportedIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits(); + }) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) + return false; + if (Ty1 == p0) + return true; + return isPowerOf2_32(Ty0.getSizeInBits()) && + (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); + }) + .clampScalar(1, s32, s128) + .widenScalarToNextPow2(1) + .maxScalarIf(typeInSet(1, {s32}), 0, s16) + .maxScalarIf(typeInSet(1, {s64}), 0, s32) + .widenScalarToNextPow2(0); + + getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 8, 2}, + {s64, p0, 16, 2}, + {s64, p0, 32, 4}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {v2s32, p0, 64, 8}}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + // TODO: We could support sum-of-pow2's but the lowering code doesn't know + // how to do that yet. 
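+      // (e.g. a 24-bit extending load would have to be split into a 16-bit
+      // and an 8-bit load and then recombined)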
+ .unsupportedIfMemSizeNotPow2() + // Lower anything left over into G_*EXT and G_LOAD + .lower(); + + auto IsPtrVecPred = [=](const LegalityQuery &Query) { + const LLT &ValTy = Query.Types[0]; + if (!ValTy.isVector()) + return false; + const LLT EltTy = ValTy.getElementType(); + return EltTy.isPointer() && EltTy.getAddressSpace() == 0; + }; + + getActionDefinitionsBuilder(G_LOAD) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {s128, p0, 128, 8}, + {v8s8, p0, 64, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) + // These extends are also legal + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) + .clampScalar(0, s8, s64) + .lowerIfMemSizeNotPow2() + // Lower any any-extending loads left into G_ANYEXT and G_LOAD + .lowerIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; + }) + .widenScalarToNextPow2(0) + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); + + getActionDefinitionsBuilder(G_STORE) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {s128, p0, 128, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) + .clampScalar(0, s8, s64) + .lowerIfMemSizeNotPow2() + .lowerIf([=](const LegalityQuery &Query) { + return Query.Types[0].isScalar() && + Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; + }) + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); + + // Constants + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({p0, s8, s16, s32, s64}) + .clampScalar(0, s8, s64) + .widenScalarToNextPow2(0); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({s32, s64}) + .clampScalar(0, s32, s64); + + getActionDefinitionsBuilder(G_ICMP) + .legalFor({{s32, s32}, + {s32, s64}, + {s32, p0}, + {v4s32, v4s32}, + {v2s32, v2s32}, + {v2s64, v2s64}, + {v2s64, v2p0}, + {v4s16, v4s16}, + {v8s16, v8s16}, + {v8s8, v8s8}, + {v16s8, v16s8}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s32) + .minScalarEltSameAsIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + return Ty.isVector() && !SrcTy.getElementType().isPointer() && + Ty.getElementType() != SrcTy.getElementType(); + }, + 0, 1) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, + 1, s32) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, + s64) + .widenScalarOrEltToNextPow2(1); + + getActionDefinitionsBuilder(G_FCMP) + .legalFor({{s32, s32}, {s32, s64}}) + .clampScalar(0, s32, s32) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1); + + // Extensions + auto ExtLegalFunc = [=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); + + if (DstSize == 128 && !Query.Types[0].isVector()) + return false; // Extending to a scalar s128 needs narrowing. + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; + + const LLT &SrcTy = Query.Types[1]; + + // Special case for s1. 
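+    // (An extend from s1 is always allowed, even though s1 would fail the
+    // minimum-size check below.)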
+ if (SrcTy == s1) + return true; + + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; + + return true; + }; + getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) + .legalIf(ExtLegalFunc) + .clampScalar(0, s64, s64); // Just for s128, others are handled above. + + getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); + + getActionDefinitionsBuilder(G_SEXT_INREG) + .legalFor({s32, s64}) + .lower(); + + // FP conversions + getActionDefinitionsBuilder(G_FPTRUNC).legalFor( + {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); + getActionDefinitionsBuilder(G_FPEXT).legalFor( + {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}); + + // Conversions + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1); + + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .clampScalar(1, s32, s64) + .widenScalarToNextPow2(1) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0); + + // Control-flow + getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32}); + getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); + + // Select + // FIXME: We can probably do a bit better than just scalarizing vector + // selects. + getActionDefinitionsBuilder(G_SELECT) + .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .scalarize(0); + + // Pointer-handling + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + + if (TM.getCodeModel() == CodeModel::Small) + getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); + else + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + + getActionDefinitionsBuilder(G_PTRTOINT) + .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) + .maxScalar(0, s64) + .widenScalarToNextPow2(0, /*Min*/ 8); + + getActionDefinitionsBuilder(G_INTTOPTR) + .unsupportedIf([&](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); + }) + .legalFor({{p0, s64}}); + + // Casts for 32 and 64-bit width type are just copies. + // Same for 128-bit width type, except they are on the FPR bank. + getActionDefinitionsBuilder(G_BITCAST) + // FIXME: This is wrong since G_BITCAST is not allowed to change the + // number of bits but it's what the previous code described and fixing + // it breaks tests. + .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, + v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, + v2p0}); + + getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); + + // va_list must be a pointer, but most sized types are pretty easy to handle + // as the destination. 
+ getActionDefinitionsBuilder(G_VAARG) + .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) + .clampScalar(0, s8, s64) + .widenScalarToNextPow2(0, /*Min*/ 8); + + if (ST.hasLSE()) { + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lowerIf(all( + typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0), + atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); + + getActionDefinitionsBuilder( + {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, + G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, + G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG}) + .legalIf(all( + typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), + atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); + } + + getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); + + // Merge/Unmerge + for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { + unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; + unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; + + auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) { + const LLT &Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const LLT &EltTy = Ty.getElementType(); + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + return true; + if (!isPowerOf2_32(EltTy.getSizeInBits())) + return true; + } + return false; + }; + + // FIXME: This rule is horrible, but specifies the same as what we had + // before with the particularly strange definitions removed (e.g. + // s8 = G_MERGE_VALUES s32, s32). + // Part of the complexity comes from these ops being extremely flexible. For + // example, you can build/decompose vectors with it, concatenate vectors, + // etc. and in addition to this you can also bitcast with it at the same + // time. We've been considering breaking it up into multiple ops to make it + // more manageable throughout the backend. + getActionDefinitionsBuilder(Op) + // Break up vectors with weird elements into scalars + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + scalarize(0)) + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + scalarize(1)) + // Clamp the big scalar to s8-s512 and make it either a power of 2, 192, + // or 384. + .clampScalar(BigTyIdx, s8, s512) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 64 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. + const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 + << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, s8, s256) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8) + // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384, + // s512, , , , or . + // At this point it's simple enough to accept the legal types. 
+ .legalIf([=](const LegalityQuery &Query) { + const LLT &BigTy = Query.Types[BigTyIdx]; + const LLT &LitTy = Query.Types[LitTyIdx]; + if (BigTy.isVector() && BigTy.getSizeInBits() < 32) + return false; + if (LitTy.isVector() && LitTy.getSizeInBits() < 32) + return false; + return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; + }) + // Any vectors left are the wrong size. Scalarize them. + .scalarize(0) + .scalarize(1); + } + + getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) + .unsupportedIf([=](const LegalityQuery &Query) { + const LLT &EltTy = Query.Types[1].getElementType(); + return Query.Types[0] != EltTy; + }) + .minScalar(2, s64) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[1]; + return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || + VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32; + }); + + getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[0]; + // TODO: Support s8 and s16 + return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64; + }); + + getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalFor({{v4s16, s16}, + {v8s16, s16}, + {v2s32, s32}, + {v4s32, s32}, + {v2p0, p0}, + {v2s64, s64}}) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + + // Deal with larger scalar types, which will be implicitly truncated. + .legalIf([=](const LegalityQuery &Query) { + return Query.Types[0].getScalarSizeInBits() < + Query.Types[1].getSizeInBits(); + }) + .minScalarSameAs(1, 0); + + getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct( + {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .scalarize(1); + + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .legalIf([=](const LegalityQuery &Query) { + const LLT &DstTy = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + // For now just support the TBL2 variant which needs the source vectors + // to be the same size as the dest. + if (DstTy != SrcTy) + return false; + for (auto &Ty : {v2s32, v4s32, v2s64}) { + if (DstTy == Ty) + return true; + } + return false; + }) + // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we + // just want those lowered into G_BUILD_VECTOR + .lowerIf([=](const LegalityQuery &Query) { + return !Query.Types[1].isVector(); + }) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64); + + getActionDefinitionsBuilder(G_CONCAT_VECTORS) + .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); + + getActionDefinitionsBuilder(G_JUMP_TABLE) + .legalFor({{p0}, {s64}}); + + getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { + return Query.Types[0] == p0 && Query.Types[1] == s64; + }); + + getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + + computeTables(); + verify(*ST.getInstrInfo()); +} + +bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + switch (MI.getOpcode()) { + default: + // No idea what to do. 
+ return false; + case TargetOpcode::G_VAARG: + return legalizeVaArg(MI, MRI, MIRBuilder); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); + } + + llvm_unreachable("expected switch to return"); +} + +bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); + // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + + // G_ADD_LOW instructions. + // By splitting this here, we can optimize accesses in the small code model by + // folding in the G_ADD_LOW into the load/store offset. + auto GV = MI.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) + return true; // Don't want to modify TLS vars. + + auto &TM = ST->getTargetLowering()->getTargetMachine(); + unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM); + + if (OpFlags & AArch64II::MO_GOT) + return true; + + Register DstReg = MI.getOperand(0).getReg(); + auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) + .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); + // Set the regclass on the dest reg too. + MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + + MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) + .addGlobalAddress(GV, 0, + OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + MI.eraseFromParent(); + return true; +} + +bool AArch64LegalizerInfo::legalizeIntrinsic( + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + +bool AArch64LegalizerInfo::legalizeShlAshrLshr( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_ASHR || + MI.getOpcode() == TargetOpcode::G_LSHR || + MI.getOpcode() == TargetOpcode::G_SHL); + // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the + // imported patterns can select it later. Either way, it will be legal. + Register AmtReg = MI.getOperand(2).getReg(); + auto *CstMI = MRI.getVRegDef(AmtReg); + assert(CstMI && "expected to find a vreg def"); + if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT) + return true; + // Check the shift amount is in range for an immediate form. + unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue(); + if (Amount > 31) + return true; // This will have to remain a register variant. 
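+  // Illustrative example: for %d:_(s32) = G_SHL %x, %amt with %amt defined by
+  // G_CONSTANT i32 3, the amount is zero-extended to s64 below so that the
+  // imported immediate-form shift patterns can match it.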
+ assert(MRI.getType(AmtReg).getSizeInBits() == 32); + auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); + MI.getOperand(2).setReg(ExtCst.getReg(0)); + return true; +} + +bool AArch64LegalizerInfo::legalizeLoadStore( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_STORE || + MI.getOpcode() == TargetOpcode::G_LOAD); + // Here we just try to handle vector loads/stores where our value type might + // have pointer elements, which the SelectionDAG importer can't handle. To + // allow the existing patterns for s64 to fire for p0, we just try to bitcast + // the value to use s64 types. + + // Custom legalization requires the instruction, if not deleted, must be fully + // legalized. In order to allow further legalization of the inst, we create + // a new instruction and erase the existing one. + + Register ValReg = MI.getOperand(0).getReg(); + const LLT ValTy = MRI.getType(ValReg); + + if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || + ValTy.getElementType().getAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); + return false; + } + + unsigned PtrSize = ValTy.getElementType().getSizeInBits(); + const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize); + auto &MMO = **MI.memoperands_begin(); + if (MI.getOpcode() == TargetOpcode::G_STORE) { + auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg); + MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO); + } else { + auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO); + MIRBuilder.buildBitcast(ValReg, NewLoad); + } + MI.eraseFromParent(); + return true; +} + +bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + Align Alignment(MI.getOperand(2).getImm()); + Register Dst = MI.getOperand(0).getReg(); + Register ListPtr = MI.getOperand(1).getReg(); + + LLT PtrTy = MRI.getType(ListPtr); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + const unsigned PtrSize = PtrTy.getSizeInBits() / 8; + const Align PtrAlign = Align(PtrSize); + auto List = MIRBuilder.buildLoad( + PtrTy, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + PtrSize, PtrAlign)); + + MachineInstrBuilder DstPtr; + if (Alignment > PtrAlign) { + // Realign the list to the actual required alignment. 
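+    // Illustrative example: for a 16-byte alignment this computes
+    // ptr = (list + 15) & ~15, built as a G_PTR_ADD of (align - 1) followed
+    // by masking off the low Log2(align) bits.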
+ auto AlignMinus1 = + MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1); + auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); + DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment)); + } else + DstPtr = List; + + uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8; + MIRBuilder.buildLoad( + Dst, DstPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + ValSize, std::max(Alignment, PtrAlign))); + + auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign)); + + auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); + + MIRBuilder.buildStore(NewList, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, + PtrSize, PtrAlign)); + + MI.eraseFromParent(); + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h new file mode 100644 index 0000000000000..1cb24559c1abf --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -0,0 +1,51 @@ +//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for +/// AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class LLVMContext; +class AArch64Subtarget; + +/// This class provides the information for the target register banks. +class AArch64LegalizerInfo : public LegalizerInfo { +public: + AArch64LegalizerInfo(const AArch64Subtarget &ST); + + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + + bool legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const override; + +private: + bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + + bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + const AArch64Subtarget *ST; +}; +} // End llvm namespace. +#endif diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp new file mode 100644 index 0000000000000..baa8515baf3ea --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -0,0 +1,507 @@ + //=== lib/CodeGen/GlobalISel/AArch64PostLegalizerCombiner.cpp -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This performs post-legalization combines on generic MachineInstrs. +// +// Any combine that this pass performs must preserve instruction legality. +// Combines unconcerned with legality should be handled by the +// PreLegalizerCombiner instead. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR. +/// +/// Used for matching target-supported shuffles before codegen. +struct ShuffleVectorPseudo { + unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1) + Register Dst; ///< Destination register. + SmallVector SrcOps; ///< Source registers. + ShuffleVectorPseudo(unsigned Opc, Register Dst, + std::initializer_list SrcOps) + : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; + ShuffleVectorPseudo() {} +}; + +/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. +/// If \p MI is not a splat, returns None. +static Optional getSplatIndex(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + "Only G_SHUFFLE_VECTOR can have a splat index!"); + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; }); + + // If all elements are undefined, this shuffle can be considered a splat. + // Return 0 for better potential for callers to simplify. + if (FirstDefinedIdx == Mask.end()) + return 0; + + // Make sure all remaining elements are either undef or the same + // as the first non-undef value. + int SplatValue = *FirstDefinedIdx; + if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()), + [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; })) + return None; + + return SplatValue; +} + +/// Check if a vector shuffle corresponds to a REV instruction with the +/// specified blocksize. +static bool isREVMask(ArrayRef M, unsigned EltSize, unsigned NumElts, + unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + assert(EltSize != 64 && "EltSize cannot be 64 for REV mask."); + + unsigned BlockElts = M[0] + 1; + + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSize; + + if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + // Ignore undef indices. + if (M[i] < 0) + continue; + if (static_cast(M[i]) != + (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts. +/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult. 
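+/// Illustrative example (not from the original comments): with NumElts = 4,
+/// the mask <0, 4, 2, 6> is a TRN1 mask (WhichResult = 0) and <1, 5, 3, 7>
+/// is a TRN2 mask (WhichResult = 1).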
+static bool isTRNMask(ArrayRef M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && static_cast(M[i]) != i + WhichResult) || + (M[i + 1] >= 0 && + static_cast(M[i + 1]) != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector +/// sources of the shuffle are different. +static Optional> getExtMask(ArrayRef M, + unsigned NumElts) { + // Look for the first non-undef element. + auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); + if (FirstRealElt == M.end()) + return None; + + // Use APInt to handle overflow when calculating expected element. + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + + // The following shuffle indices must be the successive elements after the + // first real element. + if (any_of( + make_range(std::next(FirstRealElt), M.end()), + [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; })) + return None; + + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + uint64_t Imm = ExpectedElt.getZExtValue(); + bool ReverseExt = false; + + // There are two difference cases requiring to reverse input vectors. + // For example, for vector <4 x i32> we have the following cases, + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // to reverse two input vectors. + if (Imm < NumElts) + ReverseExt = true; + else + Imm -= NumElts; + return std::make_pair(ReverseExt, Imm); +} + +/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts. +/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult. +static bool isUZPMask(ArrayRef M, unsigned NumElts, + unsigned &WhichResult) { + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + // Skip undef indices. + if (M[i] < 0) + continue; + if (static_cast(M[i]) != 2 * i + WhichResult) + return false; + } + return true; +} + +/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts. +/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult. +static bool isZipMask(ArrayRef M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + + // 0 means use ZIP1, 1 means use ZIP2. + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && static_cast(M[i]) != Idx) || + (M[i + 1] >= 0 && static_cast(M[i + 1]) != Idx + NumElts)) + return false; + Idx += 1; + } + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a +/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc. 
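+/// Illustrative example: an <8 x s8> shuffle with mask <7,6,5,4,3,2,1,0>
+/// reverses the s8 elements within a 64-bit block and is matched as G_REV64.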
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + ArrayRef ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned EltSize = Ty.getScalarSizeInBits(); + + // Element size for a rev cannot be 64. + if (EltSize == 64) + return false; + + unsigned NumElts = Ty.getNumElements(); + + // Try to produce G_REV64 + if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) { + MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src}); + return true; + } + + // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support. + // This should be identical to above, but with a constant 32 and constant + // 16. + return false; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_TRN1 or G_TRN2 instruction. +static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_UZP1 or G_UZP2 instruction. +/// +/// \param [in] MI - The shuffle vector instruction. +/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success. +static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isUZPMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isZipMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + if (Lane != 0) + return false; + + // Try to match a vector splat operation into a dup instruction. 
+ // We're looking for this pattern: + // + // %scalar:gpr(s64) = COPY $x0 + // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF + // %cst0:gpr(s32) = G_CONSTANT i32 0 + // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) + // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) + // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>) + // + // ...into: + // %splat = G_DUP %scalar + + // Begin matching the insert. + auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT, + MI.getOperand(1).getReg(), MRI); + if (!InsMI) + return false; + // Match the undef vector operand. + if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), + MRI)) + return false; + + // Match the index constant 0. + int64_t Index = 0; + if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) + return false; + + MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), + {InsMI->getOperand(2).getReg()}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromBuildVector(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(Lane >= 0 && "Expected positive lane?"); + // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the + // lane's definition directly. + auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, + MI.getOperand(1).getReg(), MRI); + if (!BuildVecMI) + return false; + Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); + MatchInfo = + ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg}); + return true; +} + +static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + auto MaybeLane = getSplatIndex(MI); + if (!MaybeLane) + return false; + int Lane = *MaybeLane; + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane < 0) + Lane = 0; + if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo)) + return true; + if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo)) + return true; + return false; +} + +static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + Register Dst = MI.getOperand(0).getReg(); + auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(), + MRI.getType(Dst).getNumElements()); + if (!ExtInfo) + return false; + bool ReverseExt; + uint64_t Imm; + std::tie(ReverseExt, Imm) = *ExtInfo; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + if (ReverseExt) + std::swap(V1, V2); + uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8; + Imm *= ExtFactor; + MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm}); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo. +/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR. +static bool applyShuffleVectorPseudo(MachineInstr &MI, + ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps); + MI.eraseFromParent(); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT. +/// Special-cased because the constant operand must be emitted as a G_CONSTANT +/// for the imported tablegen patterns to work. 
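+/// Illustrative example: an EXT of two v4s32 sources starting at element 2 has
+/// already had its immediate scaled to bytes by matchEXT (2 * 4 = 8), so this
+/// emits G_EXT %v1, %v2, (G_CONSTANT i32 8).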
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + // Tablegen patterns expect an i32 G_CONSTANT as the final op. + auto Cst = + MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm()); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, + {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst}); + MI.eraseFromParent(); + return true; +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AArch64PostLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AArch64GenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + AArch64PostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, + MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + const auto *LI = + MI.getParent()->getParent()->getSubtarget().getLegalizerInfo(); + CombinerHelper Helper(Observer, B, KB, MDT, LI); + AArch64GenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg); + return Generated.tryCombineAll(Observer, MI, B, Helper); +} + +#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPostLegalizeGICombiner.inc" +#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +class AArch64PostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AArch64PostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AArch64PostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + if (!IsOptNone) { + AU.addRequired(); + AU.addPreserved(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + assert(MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Legalized) && + "Expected a legalized function?"); + auto *TPC = &getAnalysis(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = 
&getAnalysis().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis(); + AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 MachineInstrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) { + return new AArch64PostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp new file mode 100644 index 0000000000000..9a1f200d52222 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -0,0 +1,203 @@ +//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// before the legalizer. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-prelegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +static bool matchFConstantToConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. 
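+/// Illustrative example: %c:_(s64) = G_FCONSTANT double 1.0 becomes
+/// %c:_(s64) = G_CONSTANT i64 4607182418800017408 (0x3FF0000000000000, the
+/// IEEE-754 bit pattern of 1.0).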
+static void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + +class AArch64PreLegalizerCombinerHelperState { +protected: + CombinerHelper &Helper; + +public: + AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper) + : Helper(Helper) {} +}; + +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AArch64PreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + +public: + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, MDT); + AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper); + + switch (MI.getOpcode()) { + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return (!EnableMinSize) ? 
Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; + } + default: + break; + } + } + + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); + } + + return false; +} + +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPreLegalizeGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AArch64PreLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AArch64PreLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + if (!IsOptNone) { + AU.addRequired(); + AU.addPreserved(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis(); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PreLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 machine instrs before legalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, + "Combine AArch64 machine instrs before legalization", false, + false) + + +namespace llvm { +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { + return new AArch64PreLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp new file mode 100644 index 0000000000000..7e3ff1948dad7 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -0,0 +1,868 @@ +//===- AArch64RegisterBankInfo.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AArch64. 
+/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterBankInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include + +#define GET_TARGET_REGBANK_IMPL +#include "AArch64GenRegisterBank.inc" + +// This file will be TableGen'ed at some point. +#include "AArch64GenRegisterBankInfo.def" + +using namespace llvm; + +AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) + : AArch64GenRegisterBankInfo() { + static llvm::once_flag InitializeRegisterBankFlag; + + static auto InitializeRegisterBankOnce = [&]() { + // We have only one set of register banks, whatever the subtarget + // is. Therefore, the initialization of the RegBanks table should be + // done only once. Indeed the table of all register banks + // (AArch64::RegBanks) is unique in the compiler. At some point, it + // will get tablegen'ed and the whole constructor becomes empty. + + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); + (void)RBGPR; + assert(&AArch64::GPRRegBank == &RBGPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID); + (void)RBCCR; + assert(&AArch64::CCRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); + + // The FPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && + "Subclass not added?"); + assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && + "Subclass not added?"); + assert(RBFPR.getSize() == 512 && + "FPRs should hold up to 512-bit via QQQQ sequence"); + + assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && + "Class not added?"); + assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); + + // Check that the TableGen'ed like file is in sync we our expectations. + // First, the Idx. + assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, + {PMI_GPR32, PMI_GPR64}) && + "PartialMappingIdx's are incorrectly ordered"); + assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, + PMI_FPR256, PMI_FPR512}) && + "PartialMappingIdx's are incorrectly ordered"); +// Now, the content. +// Check partial mapping. 
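+// (Illustrative reading: CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR) below is
+// expected to assert that the FPR32 partial mapping starts at bit 0, spans 32
+// bits, and lives in the FPR bank.)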
+#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ + do { \ + assert( \ + checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \ + #Idx " is incorrectly initialized"); \ + } while (false) + + CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); + CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); + CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); + CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); + CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); + CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR); + CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR); + +// Check value mapping. +#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ + do { \ + assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \ + PartialMappingIdx::PMI_First##RBName, Size, \ + Offset) && \ + #RBName #Size " " #Offset " is incorrectly initialized"); \ + } while (false) + +#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) + + CHECK_VALUEMAP(GPR, 32); + CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); + CHECK_VALUEMAP(FPR, 32); + CHECK_VALUEMAP(FPR, 64); + CHECK_VALUEMAP(FPR, 128); + CHECK_VALUEMAP(FPR, 256); + CHECK_VALUEMAP(FPR, 512); + +// Check the value mapping for 3-operands instructions where all the operands +// map to the same value mapping. +#define CHECK_VALUEMAP_3OPS(RBName, Size) \ + do { \ + CHECK_VALUEMAP_IMPL(RBName, Size, 0); \ + CHECK_VALUEMAP_IMPL(RBName, Size, 1); \ + CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ + } while (false) + + CHECK_VALUEMAP_3OPS(GPR, 32); + CHECK_VALUEMAP_3OPS(GPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 32); + CHECK_VALUEMAP_3OPS(FPR, 64); + CHECK_VALUEMAP_3OPS(FPR, 128); + CHECK_VALUEMAP_3OPS(FPR, 256); + CHECK_VALUEMAP_3OPS(FPR, 512); + +#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ + do { \ + unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getCopyMapping( \ + AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ + " Src is incorrectly initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32); + CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); + CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); + +#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ + do { \ + unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ + " Src is incorrectly 
initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); + + assert(verify(TRI) && "Invalid register bank information"); + }; + + llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); +} + +unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + // What do we do with different size? + // copy are same size. + // Will introduce other hooks for different size: + // * extract cost. + // * build_sequence cost. + + // Copy from (resp. to) GPR to (resp. from) FPR involves FMOV. + // FIXME: This should be deduced from the scheduling model. + if (&A == &AArch64::GPRRegBank && &B == &AArch64::FPRRegBank) + // FMOVXDr or FMOVWSr. + return 5; + if (&A == &AArch64::FPRRegBank && &B == &AArch64::GPRRegBank) + // FMOVDXr or FMOVSWr. + return 4; + + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank & +AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const { + switch (RC.getID()) { + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR16_loRegClassID: + case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR64_loRegClassID: + case AArch64::FPR128RegClassID: + case AArch64::FPR128_loRegClassID: + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return getRegBank(AArch64::FPRRegBankID); + case AArch64::GPR32commonRegClassID: + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32sponlyRegClassID: + case AArch64::GPR32argRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64commonRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64sponlyRegClassID: + case AArch64::GPR64argRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64noipRegClassID: + case AArch64::GPR64common_and_GPR64noipRegClassID: + case AArch64::GPR64noip_and_tcGPR64RegClassID: + case AArch64::tcGPR64RegClassID: + case AArch64::WSeqPairsClassRegClassID: + case AArch64::XSeqPairsClassRegClassID: + return getRegBank(AArch64::GPRRegBankID); + case AArch64::CCRRegClassID: + return getRegBank(AArch64::CCRegBankID); + default: + llvm_unreachable("Register class not supported"); + } +} + +RegisterBankInfo::InstructionMappings +AArch64RegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (MI.getOpcode()) { + case TargetOpcode::G_OR: { + // 32 and 64-bit or can be mapped on either FPR or + // GPR for the same cost. + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. 
+ if (MI.getNumOperands() != 3) + break; + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size), + /*NumOperands*/ 3); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size), + /*NumOperands*/ 3); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + return AltMappings; + } + case TargetOpcode::G_BITCAST: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 2) + break; + + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, + getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, + getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &GPRToFPRMapping = getInstructionMapping( + /*ID*/ 3, + /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size), + /*NumOperands*/ 2); + const InstructionMapping &FPRToGPRMapping = getInstructionMapping( + /*ID*/ 3, + /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size), + /*NumOperands*/ 2); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + AltMappings.push_back(&GPRToFPRMapping); + AltMappings.push_back(&FPRToGPRMapping); + return AltMappings; + } + case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 2) + break; + + InstructionMappings AltMappings; + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, + getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), + /*NumOperands*/ 2); + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, + getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), + /*NumOperands*/ 2); + + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + return AltMappings; + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AArch64RegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + switch (OpdMapper.getMI().getOpcode()) { + case TargetOpcode::G_OR: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_LOAD: + // Those ID must match getInstrAlternativeMappings. + assert((OpdMapper.getInstrMapping().getID() >= 1 && + OpdMapper.getInstrMapping().getID() <= 4) && + "Don't know how to handle that ID"); + return applyDefaultMapping(OpdMapper); + default: + llvm_unreachable("Don't know how to handle that operation"); + } +} + +/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, +/// having only floating-point operands. 
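+/// (For instance, G_FADD and G_FSQRT qualify; conversions such as G_SITOFP do
+/// not, since they also take a non-floating-point operand.)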
+static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FMA: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FLOG10: + case TargetOpcode::G_FLOG: + case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FABS: + case TargetOpcode::G_FEXP: + case TargetOpcode::G_FRINT: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: + return true; + } + return false; +} + +const RegisterBankInfo::InstructionMapping & +AArch64RegisterBankInfo::getSameKindOfOperandsMapping( + const MachineInstr &MI) const { + const unsigned Opc = MI.getOpcode(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumOperands = MI.getNumOperands(); + assert(NumOperands <= 3 && + "This code is for instructions with 3 or less operands"); + + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = Ty.getSizeInBits(); + bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + + PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; + +#ifndef NDEBUG + // Make sure all the operands are using similar size and type. + // Should probably be checked by the machine verifier. + // This code won't catch cases where the number of lanes is + // different between the operands. + // If we want to go to that level of details, it is probably + // best to check that the types are the same, period. + // Currently, we just check that the register banks are the same + // for each types. + for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { + LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); + assert( + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset( + RBIdx, OpTy.getSizeInBits()) == + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) && + "Operand has incompatible size"); + bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + (void)OpIsFPR; + assert(IsFPR == OpIsFPR && "Operand has incompatible type"); + } +#endif // End NDEBUG. + + return getInstructionMapping(DefaultMappingID, 1, + getValueMapping(RBIdx, Size), NumOperands); +} + +bool AArch64RegisterBankInfo::hasFPConstraints( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Op = MI.getOpcode(); + + // Do we have an explicit floating point instruction? + if (isPreISelGenericFloatingPointOpcode(Op)) + return true; + + // No. Check if we have a copy-like instruction. If we do, then we could + // still be fed by floating point instructions. + if (Op != TargetOpcode::COPY && !MI.isPHI()) + return false; + + // MI is copy-like. Return true if it outputs an FPR. 
+ return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank; +} + +bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + +bool AArch64RegisterBankInfo::onlyDefinesFP( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case AArch64::G_DUP: + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_INSERT_VECTOR_ELT: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + +const RegisterBankInfo::InstructionMapping & +AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + const unsigned Opc = MI.getOpcode(); + + // Try the default logic for non-generic instructions that are either copies + // or already have some operands assigned to banks. + if ((Opc != TargetOpcode::COPY && !isPreISelGenericOpcode(Opc)) || + Opc == TargetOpcode::G_PHI) { + const RegisterBankInfo::InstructionMapping &Mapping = + getInstrMappingImpl(MI); + if (Mapping.isValid()) + return Mapping; + } + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + + switch (Opc) { + // G_{F|S|U}REM are not listed because they are not legal. + // Arithmetic ops. + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_PTR_ADD: + case TargetOpcode::G_MUL: + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + // Bitwise ops. + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + // Floating point ops. + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + return getSameKindOfOperandsMapping(MI); + case TargetOpcode::G_FPEXT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), + /*NumOperands*/ 2); + } + // Shifts. + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: { + LLT ShiftAmtTy = MRI.getType(MI.getOperand(2).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (ShiftAmtTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() == 32) + return getInstructionMapping(DefaultMappingID, 1, + &ValMappings[Shift64Imm], 3); + return getSameKindOfOperandsMapping(MI); + } + case TargetOpcode::COPY: { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + // Check if one of the register is not a generic register. + if ((Register::isPhysicalRegister(DstReg) || + !MRI.getType(DstReg).isValid()) || + (Register::isPhysicalRegister(SrcReg) || + !MRI.getType(SrcReg).isValid())) { + const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); + const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); + if (!DstRB) + DstRB = SrcRB; + else if (!SrcRB) + SrcRB = DstRB; + // If both RB are null that means both registers are generic. + // We shouldn't be here. 
+ assert(DstRB && SrcRB && "Both RegBank were nullptr"); + unsigned Size = getSizeInBits(DstReg, MRI, TRI); + return getInstructionMapping( + DefaultMappingID, copyCost(*DstRB, *SrcRB, Size), + getCopyMapping(DstRB->getID(), SrcRB->getID(), Size), + // We only care about the mapping of the destination. + /*NumOperands*/ 1); + } + // Both registers are generic, use G_BITCAST. + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_BITCAST: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned Size = DstTy.getSizeInBits(); + bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64; + bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64; + const RegisterBank &DstRB = + DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; + const RegisterBank &SrcRB = + SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; + return getInstructionMapping( + DefaultMappingID, copyCost(DstRB, SrcRB, Size), + getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), + // We only care about the mapping of the destination for COPY. + /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); + } + default: + break; + } + + unsigned NumOperands = MI.getNumOperands(); + + // Track the size and bank of each register. We don't do partial mappings. + SmallVector OpSize(NumOperands); + SmallVector OpRegBankIdx(NumOperands); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + auto &MO = MI.getOperand(Idx); + if (!MO.isReg() || !MO.getReg()) + continue; + + LLT Ty = MRI.getType(MO.getReg()); + OpSize[Idx] = Ty.getSizeInBits(); + + // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. + // For floating-point instructions, scalars go in FPRs. + if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) || + Ty.getSizeInBits() > 64) + OpRegBankIdx[Idx] = PMI_FirstFPR; + else + OpRegBankIdx[Idx] = PMI_FirstGPR; + } + + unsigned Cost = 1; + // Some of the floating-point instructions have mixed GPR and FPR operands: + // fine-tune the computed mapping. + switch (Opc) { + case AArch64::G_DUP: { + Register ScalarReg = MI.getOperand(1).getReg(); + auto ScalarDef = MRI.getVRegDef(ScalarReg); + if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*ScalarDef, MRI, TRI)) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + else + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + break; + } + case TargetOpcode::G_TRUNC: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + break; + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; + OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; + break; + case TargetOpcode::G_FCMP: + OpRegBankIdx = {PMI_FirstGPR, + /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR}; + break; + case TargetOpcode::G_BITCAST: + // This is going to be a cross register bank copy and this is expensive. + if (OpRegBankIdx[0] != OpRegBankIdx[1]) + Cost = copyCost( + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank, + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank, + OpSize[0]); + break; + case TargetOpcode::G_LOAD: + // Loading in vector unit is slightly more expensive. 
+ // This is actually only true for the LD1R and co instructions, + // but anyway for the fast mode this number does not matter and + // for the greedy mode the cost of the cross bank copy will + // offset this number. + // FIXME: Should be derived from the scheduling model. + if (OpRegBankIdx[0] != PMI_FirstGPR) + Cost = 2; + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + for (const MachineInstr &UseMI : + MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) { + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (onlyUsesFP(UseMI, MRI, TRI)) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + } + break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + Register VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (onlyDefinesFP(*DefMI, MRI, TRI)) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + break; + case TargetOpcode::G_SELECT: { + // If the destination is FPR, preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + // If we're taking in vectors, we have no choice but to put everything on + // FPRs, except for the condition. The condition must always be on a GPR. + LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); + if (SrcTy.isVector()) { + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; + break; + } + + // Try to minimize the number of copies. If we have more floating point + // constrained values than not, then we'll put everything on FPR. Otherwise, + // everything has to be on GPR. + unsigned NumFP = 0; + + // Check if the uses of the result always produce floating point values. + // + // For example: + // + // %z = G_SELECT %cond %x %y + // fpr = G_FOO %z ... + if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) + ++NumFP; + + // Check if the defs of the source values always produce floating point + // values. + // + // For example: + // + // %x = G_SOMETHING_ALWAYS_FLOAT %a ... + // %z = G_SELECT %cond %x %y + // + // Also check whether or not the sources have already been decided to be + // FPR. Keep track of this. + // + // This doesn't check the condition, since it's just whatever is in NZCV. + // This isn't passed explicitly in a register to fcsel/csel. + for (unsigned Idx = 2; Idx < 4; ++Idx) { + Register VReg = MI.getOperand(Idx).getReg(); + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*DefMI, MRI, TRI)) + ++NumFP; + } + + // If we have more FP constraints than not, then move everything over to + // FPR. + if (NumFP >= 2) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; + + break; + } + case TargetOpcode::G_UNMERGE_VALUES: { + // If the first operand belongs to a FPR register bank, then make sure that + // we preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); + // UNMERGE into scalars from a vector should always use FPR. + // Likewise if any of the uses are FP instructions. 
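The bank-assignment cases above all follow the same greedy idea: look at how a value is produced and consumed, and keep it on FPR when most of its neighbours are floating point. A minimal, self-contained sketch of the G_SELECT variant of that heuristic; SelectInfo and chooseSelectBank are illustrative names, not LLVM API.

#include <cassert>

enum Bank { GPR, FPR };

struct SelectInfo {
  bool ResultHasFPUse;      // some user of the select result is an FP instruction
  bool TrueValDefinedByFP;  // the "true" source value is produced by an FP instruction
  bool FalseValDefinedByFP; // the "false" source value is produced by an FP instruction
};

// Count the FP constraints around a G_SELECT-like operation and put it on
// FPR only when that saves cross-bank copies overall (NumFP >= 2).
Bank chooseSelectBank(const SelectInfo &S) {
  unsigned NumFP = 0;
  if (S.ResultHasFPUse)
    ++NumFP;
  if (S.TrueValDefinedByFP)
    ++NumFP;
  if (S.FalseValDefinedByFP)
    ++NumFP;
  return NumFP >= 2 ? FPR : GPR;
}

int main() {
  assert(chooseSelectBank({true, true, false}) == FPR);  // two FP neighbours
  assert(chooseSelectBank({false, false, true}) == GPR); // only one
  return 0;
}

With two or more FP-constrained neighbours, keeping the select on FPR costs at most one cross-bank copy; on GPR it would cost at least two, which is the trade-off the mapping code above is making.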
+ if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || + any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { + // Set the register bank of every operand to FPR. + for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); + Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + // Destination and source need to be FPRs. + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // Index needs to be a GPR. + OpRegBankIdx[2] = PMI_FirstGPR; + break; + case TargetOpcode::G_INSERT_VECTOR_ELT: + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // The element may be either a GPR or FPR. Preserve that behaviour. + if (getRegBank(MI.getOperand(2).getReg(), MRI, TRI) == &AArch64::FPRRegBank) + OpRegBankIdx[2] = PMI_FirstFPR; + else + OpRegBankIdx[2] = PMI_FirstGPR; + + // Index needs to be a GPR. + OpRegBankIdx[3] = PMI_FirstGPR; + break; + case TargetOpcode::G_EXTRACT: { + // For s128 sources we have to use fpr. + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (SrcTy.getSizeInBits() == 128) { + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + } + break; + } + case TargetOpcode::G_BUILD_VECTOR: + // If the first source operand belongs to a FPR register bank, then make + // sure that we preserve that. + if (OpRegBankIdx[1] != PMI_FirstGPR) + break; + Register VReg = MI.getOperand(1).getReg(); + if (!VReg) + break; + + // Get the instruction that defined the source operand reg, and check if + // it's a floating point operation. Or, if it's a type like s16 which + // doesn't have a exact size gpr register class. + MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned DefOpc = DefMI->getOpcode(); + const LLT SrcTy = MRI.getType(VReg); + if (isPreISelGenericFloatingPointOpcode(DefOpc) || + SrcTy.getSizeInBits() < 32) { + // Have a floating point op. + // Make sure every operand gets mapped to a FPR register class. + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + + // Finally construct the computed mapping. + SmallVector OpdsMapping(NumOperands); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) { + auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + if (!Mapping->isValid()) + return getInvalidInstructionMapping(); + + OpdsMapping[Idx] = Mapping; + } + } + + return getInstructionMapping(DefaultMappingID, Cost, + getOperandsMapping(OpdsMapping), NumOperands); +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h new file mode 100644 index 0000000000000..e956fca1aa109 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -0,0 +1,145 @@ +//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AArch64. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "AArch64GenRegisterBank.inc" + +namespace llvm { + +class TargetRegisterInfo; + +class AArch64GenRegisterBankInfo : public RegisterBankInfo { +protected: + enum PartialMappingIdx { + PMI_None = -1, + PMI_FPR16 = 1, + PMI_FPR32, + PMI_FPR64, + PMI_FPR128, + PMI_FPR256, + PMI_FPR512, + PMI_GPR32, + PMI_GPR64, + PMI_FirstGPR = PMI_GPR32, + PMI_LastGPR = PMI_GPR64, + PMI_FirstFPR = PMI_FPR16, + PMI_LastFPR = PMI_FPR512, + PMI_Min = PMI_FirstFPR, + }; + + static RegisterBankInfo::PartialMapping PartMappings[]; + static RegisterBankInfo::ValueMapping ValMappings[]; + static PartialMappingIdx BankIDToCopyMapIdx[]; + + enum ValueMappingIdx { + InvalidIdx = 0, + First3OpsIdx = 1, + Last3OpsIdx = 22, + DistanceBetweenRegBanks = 3, + FirstCrossRegCpyIdx = 25, + LastCrossRegCpyIdx = 39, + DistanceBetweenCrossRegCpy = 2, + FPExt16To32Idx = 41, + FPExt16To64Idx = 43, + FPExt32To64Idx = 45, + FPExt64To128Idx = 47, + Shift64Imm = 49 + }; + + static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, + unsigned ValLength, const RegisterBank &RB); + static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank, + unsigned Size, unsigned Offset); + static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias, + PartialMappingIdx LastAlias, + ArrayRef Order); + + static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping representing the RegisterBank + /// at \p RBIdx with a size of \p Size. + /// + /// The returned mapping works for instructions with the same kind of + /// operands for up to 3 operands. + /// + /// \pre \p RBIdx != PartialMappingIdx::None + static const RegisterBankInfo::ValueMapping * + getValueMapping(PartialMappingIdx RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping of the operands of a copy + /// instruction from the \p SrcBankID register bank to the \p DstBankID + /// register bank with a size of \p Size. + static const RegisterBankInfo::ValueMapping * + getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + + /// Get the instruction mapping for G_FPEXT. + /// + /// \pre (DstSize, SrcSize) pair is one of the following: + /// (32, 16), (64, 16), (64, 32), (128, 64) + /// + /// \return An InstructionMapping with statically allocated OperandsMapping. + static const RegisterBankInfo::ValueMapping * + getFPExtMapping(unsigned DstSize, unsigned SrcSize); + +#define GET_TARGET_REGBANK_CLASS +#include "AArch64GenRegisterBank.inc" +}; + +/// This class provides the information for the target register banks. +class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + + /// Get an instruction mapping where all the operands map to + /// the same register bank and have similar size. + /// + /// \pre MI.getNumOperands() <= 3 + /// + /// \return An InstructionMappings with a statically allocated + /// OperandsMapping. + const InstructionMapping & + getSameKindOfOperandsMapping(const MachineInstr &MI) const; + + /// Returns true if the output of \p MI must be stored on a FPR register. 
+ bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Returns true if the source registers of \p MI must all be FPRs. + bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Returns true if the destination register of \p MI must be a FPR. + bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + +public: + AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); + + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const override; + + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; + + const InstructionMapping & + getInstrMapping(const MachineInstr &MI) const override; +}; +} // End llvm namespace. +#endif diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 05a909f1780a0..9814f76258538 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -763,10 +763,10 @@ static inline bool isSVECpyImm(int64_t Imm) { bool IsImm8 = int8_t(Imm) == Imm; bool IsImm16 = int16_t(Imm & ~0xff) == Imm; - if (std::is_same::type>::value) + if (std::is_same>::value) return IsImm8 || uint8_t(Imm) == Imm; - if (std::is_same::type>::value) + if (std::is_same>::value) return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm; return IsImm8 || IsImm16; @@ -775,8 +775,7 @@ static inline bool isSVECpyImm(int64_t Imm) { /// Returns true if Imm is valid for ADD/SUB. template static inline bool isSVEAddSubImm(int64_t Imm) { - bool IsInt8t = - std::is_same::type>::value; + bool IsInt8t = std::is_same>::value; return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 9db746733aa35..9f7dfdf624829 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -33,6 +34,7 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; +protected: Triple TheTriple; public: @@ -68,6 +70,11 @@ public: {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; + // Fixup kinds from .reloc directive are like R_AARCH64_NONE. They do not + // require any extra processing. 
+ if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -86,8 +93,8 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; void HandleAssemblerFlag(MCAssemblerFlag Flag) {} @@ -108,7 +115,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); - case FK_NONE: case AArch64::fixup_aarch64_tlsdesc_call: return 0; @@ -237,11 +243,22 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, static_cast(Target.getRefKind()); if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS && AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) { - // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't - // ever be resolved in the assembler. - Ctx.reportError(Fixup.getLoc(), - "relocation for a thread-local variable points to an " - "absolute symbol"); + if (!RefKind) { + // The fixup is an expression + if (SignedValue > 0xFFFF || SignedValue < -0xFFFF) + Ctx.reportError(Fixup.getLoc(), + "fixup value out of range [-0xFFFF, 0xFFFF]"); + + // Invert the negative immediate because it will feed into a MOVN. + if (SignedValue < 0) + SignedValue = ~SignedValue; + Value = static_cast(SignedValue); + } else + // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't + // ever be resolved in the assembler. + Ctx.reportError(Fixup.getLoc(), + "relocation for a thread-local variable points to an " + "absolute symbol"); return Value; } @@ -329,7 +346,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value)) Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!"); LLVM_FALLTHROUGH; - case FK_NONE: case FK_SecRel_2: case FK_SecRel_4: return Value; @@ -337,9 +353,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, } Optional AArch64AsmBackend::getFixupKind(StringRef Name) const { - if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE") - return FK_NONE; - return MCAsmBackend::getFixupKind(Name); + if (!TheTriple.isOSBinFormatELF()) + return None; + + unsigned Type = llvm::StringSwitch(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/AArch64.def" +#undef ELF_RELOC + .Default(-1u); + if (Type == -1u) + return None; + return static_cast(FirstLiteralRelocationKind + Type); } /// getFixupKindContainereSizeInBytes - The number of bytes of the @@ -386,9 +410,12 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. 
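For readers unfamiliar with the .reloc handling introduced above: a relocation named in a .reloc directive is carried through MC as a "literal" fixup kind offset from FirstLiteralRelocationKind, and the ELF object writer simply subtracts the base again, with no target-specific fixup processing in between. A minimal sketch of that round trip; the base constant and the two helpers are stand-ins, not LLVM's actual definitions, and the relocation values follow the AArch64 ELF ABI.

#include <cassert>
#include <map>
#include <string>

// Large base so literal kinds never collide with ordinary fixup kinds.
constexpr unsigned FirstLiteralRelocationKind = 1u << 16;

// A couple of AArch64 ELF relocation types by name.
const std::map<std::string, unsigned> RelocByName = {
    {"R_AARCH64_NONE", 0},
    {"R_AARCH64_ABS64", 257},
};

// What a ".reloc offset, R_AARCH64_xxx, sym" directive turns into.
unsigned getLiteralFixupKind(const std::string &Name) {
  return FirstLiteralRelocationKind + RelocByName.at(Name);
}

// What the object writer emits for such a kind.
unsigned getRelocTypeFor(unsigned Kind) {
  assert(Kind >= FirstLiteralRelocationKind && "not a literal kind");
  return Kind - FirstLiteralRelocationKind;
}

int main() {
  assert(getRelocTypeFor(getLiteralFixupKind("R_AARCH64_ABS64")) == 257);
  assert(getRelocTypeFor(getLiteralFixupKind("R_AARCH64_NONE")) == 0);
  return 0;
}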
+ unsigned Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return; + unsigned NumBytes = getFixupKindNumBytes(Kind); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); MCContext &Ctx = Asm.getContext(); int64_t SignedValue = static_cast(Value); @@ -424,8 +451,9 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, // FIXME: getFixupKindInfo() and getFixupKindNumBytes() could be fixed to // handle this more cleanly. This may affect the output of -show-mc-encoding. AArch64MCExpr::VariantKind RefKind = - static_cast(Target.getRefKind()); - if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) { + static_cast(Target.getRefKind()); + if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS || + (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) { // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. if (SignedValue < 0) @@ -451,9 +479,8 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, return int64_t(Value) != int64_t(int8_t(Value)); } -void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void AArch64AsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } @@ -474,7 +501,7 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) { unsigned Kind = Fixup.getKind(); - if (Kind == FK_NONE) + if (Kind >= FirstLiteralRelocationKind) return true; // The ADRP instruction adds some multiple of 0x1000 to the current PC & @@ -544,7 +571,6 @@ enum CompactUnwindEncodings { // FIXME: This should be in a separate file. class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; - bool IsILP32; /// Encode compact unwind stack adjustment for frameless functions. /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. @@ -555,18 +581,15 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const Triple &TT, - const MCRegisterInfo &MRI, bool IsILP32) - : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI), - IsILP32(IsILP32) {} + const MCRegisterInfo &MRI) + : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {} std::unique_ptr createObjectTargetWriter() const override { - if (IsILP32) - return createAArch64MachObjectWriter( - MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true); - else - return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL, false); + uint32_t CPUType = cantFail(MachO::getCPUType(TheTriple)); + uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TheTriple)); + return createAArch64MachObjectWriter(CPUType, CPUSubType, + TheTriple.isArch32Bit()); } /// Generate the compact unwind encoding from the CFI directives. 
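The MOVZ/MOVN selection above relies on a small bit of arithmetic: MOVN materialises the bitwise complement of its immediate, so a negative fixup value is inverted before being written into the instruction. A self-contained sketch of that round trip, assuming an unshifted 16-bit immediate and a 64-bit destination; the struct and function names are illustrative, not LLVM's.

#include <cassert>
#include <cstdint>

// Model only the parts of the instruction word the fixup writes.
struct MovEncoding {
  bool IsMovn;
  uint16_t Imm16;
};

// Resolve a movw-style fixup value V in [-0xFFFF, 0xFFFF]: negative values
// are bitwise inverted so that the MOVN form reproduces them exactly.
MovEncoding encodeMovImm(int64_t V) {
  assert(V >= -0xFFFF && V <= 0xFFFF && "fixup value out of range");
  if (V < 0)
    return {true, static_cast<uint16_t>(~V)};
  return {false, static_cast<uint16_t>(V)};
}

// What the CPU computes for each form (no shift, 64-bit register).
int64_t materialise(const MovEncoding &E) {
  uint64_t Imm = E.Imm16;
  return static_cast<int64_t>(E.IsMovn ? ~Imm : Imm);
}

int main() {
  for (long long V : {-0xFFFFLL, -2LL, -1LL, 0LL, 7LL, 0xFFFFLL})
    assert(materialise(encodeMovImm(V)) == V);
  return 0;
}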
@@ -749,8 +772,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); if (TheTriple.isOSBinFormatMachO()) { - const bool IsILP32 = TheTriple.isArch32Bit(); - return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32); + return new DarwinAArch64AsmBackend(T, TheTriple, MRI); } if (TheTriple.isOSBinFormatCOFF()) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 0fd1ca187be7f..e5637dcab9419 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -106,13 +106,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + unsigned Kind = Fixup.getTargetKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; AArch64MCExpr::VariantKind RefKind = static_cast(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); bool IsNC = AArch64MCExpr::isNotChecked(RefKind); assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_PLT) && "Should only be expression-level modifiers here"); assert((!Target.getSymB() || @@ -120,14 +124,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, "Should only be expression-level modifiers here"); if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Kind) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; case FK_Data_2: return R_CLS(PREL16); - case FK_Data_4: - return R_CLS(PREL32); + case FK_Data_4: { + return Target.getAccessVariant() == MCSymbolRefExpr::VK_PLT + ? R_CLS(PLT32) + : R_CLS(PREL32); + } case FK_Data_8: if (IsILP32) { Ctx.reportError(Fixup.getLoc(), @@ -185,8 +192,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; switch (Fixup.getTargetKind()) { - case FK_NONE: - return ELF::R_AARCH64_NONE; case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index c33f7e957b54a..fe4c34be1519b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -81,14 +81,14 @@ public: std::move(Emitter)), MappingSymbolCounter(0), LastEMS(EMS_None) {} - void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { + void changeSection(MCSection *Section, const MCExpr *Subsection) override { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. 
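The mapping-symbol bookkeeping being renamed in this streamer works like a tiny per-section state machine: remember whether the last emission was A64 code or data, and emit a local mapping symbol ("$x" for code on AArch64, "$d" for data) only on transitions. A rough stand-alone model of that behaviour; SectionEmitter is an illustrative stand-in, not the MCELFStreamer API.

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

enum MappingState { EMS_None, EMS_A64, EMS_Data };

struct SectionEmitter {
  MappingState LastEMS = EMS_None;
  std::vector<std::string> Out; // stand-in for the object stream

  // Emit "$x" before the first instruction after data (or at section start).
  void emitInstruction(uint32_t Word) {
    if (LastEMS != EMS_A64) {
      Out.push_back("$x");
      LastEMS = EMS_A64;
    }
    std::ostringstream OS;
    OS << "insn 0x" << std::hex << Word;
    Out.push_back(OS.str());
  }

  // Emit "$d" before the first data bytes after instructions.
  void emitBytes(const std::string &Data) {
    if (LastEMS != EMS_Data) {
      Out.push_back("$d");
      LastEMS = EMS_Data;
    }
    Out.push_back("bytes " + Data);
  }
};

int main() {
  SectionEmitter S;
  S.emitInstruction(0xd503201f); // nop  -> "$x" is emitted first
  S.emitInstruction(0xd65f03c0); // ret  -> state unchanged, no new symbol
  S.emitBytes("<literal pool>"); //      -> "$d" is emitted first
  for (const std::string &Line : S.Out)
    std::cout << Line << "\n";
  return 0;
}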
LastMappingSymbols[getPreviousSection().first] = LastEMS; LastEMS = LastMappingSymbols.lookup(Section); - MCELFStreamer::ChangeSection(Section, Subsection); + MCELFStreamer::changeSection(Section, Subsection); } // Reset state between object emissions @@ -102,10 +102,10 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, + void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); + MCELFStreamer::emitInstruction(Inst, STI); } /// Emit a 32-bit value as an instruction. This is only used for the .inst @@ -122,28 +122,28 @@ public: } EmitA64MappingSymbol(); - MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); + MCELFStreamer::emitBytes(StringRef(Buffer, 4)); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); + void emitBytes(StringRef Data) override { + emitDataMappingSymbol(); + MCELFStreamer::emitBytes(Data); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, Loc); + void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { + emitDataMappingSymbol(); + MCELFStreamer::emitValueImpl(Value, Size, Loc); } void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc) override { - EmitDataMappingSymbol(); + emitDataMappingSymbol(); MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); } private: @@ -153,7 +153,7 @@ private: EMS_Data }; - void EmitDataMappingSymbol() { + void emitDataMappingSymbol() { if (LastEMS == EMS_Data) return; EmitMappingSymbol("$d"); @@ -170,7 +170,7 @@ private: void EmitMappingSymbol(StringRef Name) { auto *Symbol = cast(getContext().getOrCreateSymbol( Name + "." 
+ Twine(MappingSymbolCounter++))); - EmitLabel(Symbol); + emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); Symbol->setExternal(false); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 469892213ef87..38474d31460dd 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -283,7 +283,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, } if (Opcode == AArch64::SPACE) { - O << '\t' << MAI.getCommentString() << " SPACE"; + O << '\t' << MAI.getCommentString() << " SPACE " + << MI->getOperand(1).getImm(); printAnnotation(O, Annot); return; } @@ -295,7 +296,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } - if (!printAliasInstr(MI, STI, O)) + if (!printAliasInstr(MI, Address, STI, O)) printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); @@ -900,6 +901,19 @@ void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, O << format("#%#llx", Op.getImm()); } +template +void AArch64InstPrinter::printSImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Size == 8) + O << "#" << formatImm((signed char)Op.getImm()); + else if (Size == 16) + O << "#" << formatImm((signed short)Op.getImm()); + else + O << "#" << formatImm(Op.getImm()); +} + void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -1334,7 +1348,8 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, O << "[" << MI->getOperand(OpNum).getImm() << "]"; } -void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, +void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, + unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); @@ -1342,17 +1357,20 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << formatImm(Op.getImm() * 4); + int64_t Offset = Op.getImm() * 4; + if (PrintBranchImmAsAddress) + O << formatHex(Address + Offset); + else + O << "#" << formatImm(Offset); return; } // If the branch target is simply an address then print it in hex. const MCConstantExpr *BranchTarget = dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + int64_t TargetAddress; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(TargetAddress)) { + O << formatHex(TargetAddress); } else { // Otherwise, just print the expression. MI->getOperand(OpNum).getExpr()->print(O, &MAI); @@ -1411,6 +1429,12 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, return; } + // Horrible hack for two different registers having the same encoding. 
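The printAlignedLabel change above lets resolved branch targets be printed as absolute addresses: the encoded operand is a word offset from the instruction, so the printed target is the instruction address plus four times the immediate when PrintBranchImmAsAddress is enabled. A small sketch of that calculation; branchTarget is an illustrative helper, not part of the printer.

#include <cassert>
#include <cstdint>

// Branch/label immediates are word (4-byte) offsets from the instruction.
uint64_t branchTarget(uint64_t InsnAddress, int64_t Imm) {
  return InsnAddress + static_cast<uint64_t>(Imm * 4);
}

int main() {
  assert(branchTarget(0x400100, 4) == 0x400110);  // four instructions forward
  assert(branchTarget(0x400110, -4) == 0x400100); // ...and back again
  return 0;
}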
+ if (Val == AArch64SysReg::TRCEXTINSELR) { + O << "TRCEXTINSELR"; + return; + } + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) O << Reg->Name; @@ -1431,6 +1455,12 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, return; } + // Horrible hack for two different registers having the same encoding. + if (Val == AArch64SysReg::TRCEXTINSELR) { + O << "TRCEXTINSELR"; + return; + } + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) O << Reg->Name; @@ -1499,7 +1529,7 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum, template void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) { - typename std::make_unsigned::type HexValue = Value; + std::make_unsigned_t HexValue = Value; if (getPrintImmHex()) O << '#' << formatHex((uint64_t)HexValue); @@ -1544,8 +1574,8 @@ template void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - typedef typename std::make_signed::type SignedT; - typedef typename std::make_unsigned::type UnsignedT; + typedef std::make_signed_t SignedT; + typedef std::make_unsigned_t UnsignedT; uint64_t Val = MI->getOperand(OpNum).getImm(); UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 993f379b53433..6da5f0e81c803 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -32,10 +32,10 @@ public: // Autogenerated by tblgen. 
virtual void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, + virtual bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); @@ -56,6 +56,9 @@ protected: raw_ostream &O); void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + template + void printSImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); template void printImmSVE(T Value, raw_ostream &O); void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O); @@ -97,7 +100,7 @@ protected: const MCSubtargetInfo &STI, raw_ostream &O); void printInverseCondCode(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned OpNum, + void printAlignedLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, raw_ostream &O); @@ -202,10 +205,10 @@ public: void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O) override; - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, + bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O) override; + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O) override; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 5926a4f81616c..9a63e26dec190 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -60,7 +60,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context); MCSymbol *PCSym = Context.createTempSymbol(); - Streamer.EmitLabel(PCSym); + Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context); return MCBinaryExpr::createSub(Res, PC, Context); } @@ -96,8 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - UseIntegratedAssembler = true; - HasIdentDirective = true; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 8f4d9cb94d607..da8f511c650f0 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -569,23 +569,24 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, if (UImm16MO.isImm()) return EncodedValue; - const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); - switch (A64E->getKind()) { - case AArch64MCExpr::VK_DTPREL_G2: - case AArch64MCExpr::VK_DTPREL_G1: - 
case AArch64MCExpr::VK_DTPREL_G0: - case AArch64MCExpr::VK_GOTTPREL_G1: - case AArch64MCExpr::VK_TPREL_G2: - case AArch64MCExpr::VK_TPREL_G1: - case AArch64MCExpr::VK_TPREL_G0: - return EncodedValue & ~(1u << 30); - default: - // Nothing to do for an unsigned fixup. - return EncodedValue; + const MCExpr *E = UImm16MO.getExpr(); + if (const AArch64MCExpr *A64E = dyn_cast(E)) { + switch (A64E->getKind()) { + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } } - - return EncodedValue & ~(1u << 30); + return EncodedValue; } void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 7dc3665baabc5..209bff3a23117 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -254,7 +254,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, // Initial state of the frame pointer is SP. unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0); MAI->addInitialFrameState(Inst); return MAI; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index fc04d37eb3623..b0f414bd27edd 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -139,7 +139,7 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section, return false; if (RefSec.getSegmentName() == "__DATA" && - RefSec.getSectionName() == "__objc_classrefs") + RefSec.getName() == "__objc_classrefs") return false; // FIXME: ld64 currently handles internal pointer-sized relocations @@ -407,5 +407,5 @@ std::unique_ptr llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) { return std::make_unique(CPUType, CPUSubtype, - IsILP32); + IsILP32); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index f70752f5303f3..48ed68f492635 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -51,7 +51,7 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) { Inst >>= 8; } - getStreamer().EmitBytes(StringRef(Buffer, 4)); + getStreamer().emitBytes(StringRef(Buffer, 4)); } namespace llvm { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 37c6fbb039081..03fbab5142a2e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -28,7 +28,7 @@ public: void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; - void FinishImpl() override; + void finishImpl() override; }; void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { @@ -45,11 
+45,11 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { EHStreamer.Emit(*this); } -void AArch64WinCOFFStreamer::FinishImpl() { - EmitFrames(nullptr); +void AArch64WinCOFFStreamer::finishImpl() { + emitFrames(nullptr); EmitWindowsUnwindTables(); - MCWinCOFFStreamer::FinishImpl(); + MCWinCOFFStreamer::finishImpl(); } } // end anonymous namespace @@ -68,7 +68,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode, WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); if (!CurFrame) return; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); if (InEpilogCFI) CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); @@ -158,7 +158,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() { if (!CurFrame) return; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); CurFrame->PrologEnd = Label; WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); auto it = CurFrame->Instructions.begin(); @@ -172,7 +172,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() { return; InEpilogCFI = true; - CurrentEpilog = S.EmitCFILabel(); + CurrentEpilog = S.emitCFILabel(); } void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() { @@ -182,7 +182,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() { return; InEpilogCFI = false; - MCSymbol *Label = S.EmitCFILabel(); + MCSymbol *Label = S.emitCFILabel(); WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); CurrentEpilog = nullptr; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a172b8d7e6b0a..a005d1e65abe1 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10,6 +10,14 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64Setcc : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>, + SDTCisVT<4, OtherVT> +]>; + +def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>; + def SVEPatternOperand : AsmOperandClass { let Name = "SVEPattern"; let ParserMethod = "tryParseSVEPattern"; @@ -33,7 +41,7 @@ def SVEPrefetchOperand : AsmOperandClass { let RenderMethod = "addPrefetchOperands"; } -def sve_prfop : Operand, ImmLeaf, TImmLeaf { let PrintMethod = "printPrefetchOp"; @@ -167,8 +175,8 @@ def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm">; class imm8_opt_lsl - : Operand, ImmLeaf { + AsmOperandClass OpndClass> + : Operand { let EncoderMethod = "getImm8OptLsl"; let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">"; let PrintMethod = "printImm8OptLsl<" # printType # ">"; @@ -176,31 +184,15 @@ class imm8_opt_lsl(Imm); -}]>; -def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; -def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; -def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; - -def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, 
"uint16_t", SVEAddSubImmOperand16, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; +def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>; +def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>; +def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>; +def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>; + +def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>; +def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>; +def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>; +def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>; def SVEAddSubImm8Pat : ComplexPattern", []>; def SVEAddSubImm16Pat : ComplexPattern", []>; @@ -212,9 +204,13 @@ def SVELogicalImm16Pat : ComplexPattern", def SVELogicalImm32Pat : ComplexPattern", []>; def SVELogicalImm64Pat : ComplexPattern", []>; +def SVE8BitLslImm : ComplexPattern; + def SVEArithUImmPat : ComplexPattern; def SVEArithSImmPat : ComplexPattern; +def SVEShiftImm64 : ComplexPattern", []>; + class SVEExactFPImm : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; let DiagnosticType = "Invalid" # Name; @@ -324,6 +320,16 @@ class SVE_1_Op_Imm_Arith_Pat; +class SVE_1_Op_Imm_Shift_Pred_Pat + : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))), + (inst $Op1, ImmTy:$imm)>; + +class SVE_1_Op_Imm_Arith_Pred_Pat + : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + (inst $Op1, i32:$imm)>; + class SVE_1_Op_Imm_Log_Pat : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))), @@ -367,8 +373,22 @@ class SVE_4_Op_Imm_Pat; +def SVEDup0 : ComplexPattern; def SVEDup0Undef : ComplexPattern; +let AddedComplexity = 1 in { +class SVE_3_Op_Pat_SelZero +: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), + (inst $Op1, $Op2, $Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3)>; +} + // // Common but less generic patterns. // @@ -378,6 +398,69 @@ class SVE_1_Op_AllActive_Pat; +class SVE_2_Op_AllActive_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2)), + (inst (ptrue 31), $Op1, $Op2)>; + +// +// Pseudo -> Instruction mappings +// +def getSVEPseudoMap : InstrMapping { + let FilterClass = "SVEPseudo2Instr"; + let RowFields = ["PseudoName"]; + let ColFields = ["IsInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +class SVEPseudo2Instr { + string PseudoName = name; + bit IsInstr = instr; +} + +// Lookup e.g. DIV -> DIVR +def getSVERevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Lookup e.g. 
DIVR -> DIV +def getSVENonRevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +class SVEInstr2Rev { + string InstrName = !if(name1IsReverseInstr, name1, name2); + bit isReverseInstr = name1IsReverseInstr; +} + +// +// Pseudos for destructive operands +// +let hasNoSchedulingInfo = 1 in { + class PredTwoOpPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> { + let FalseLanes = flags; + } + + class PredTwoOpImmPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { + let FalseLanes = flags; + } +} + //===----------------------------------------------------------------------===// // SVE Predicate Misc Group //===----------------------------------------------------------------------===// @@ -566,7 +649,7 @@ class sve_int_count_v sz8_64, bits<5> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -680,7 +763,7 @@ class sve_int_countvlv opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -941,11 +1024,46 @@ multiclass sve_int_perm_tbl { def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve2_int_perm_tbl { +multiclass sve2_int_perm_tbl { def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>; def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>; def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>; def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>; + + def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), + (nxv16i8 (!cast(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, + nxv16i8:$Op2, zsub1), + nxv16i8:$Op3))>; + + def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)), + (nxv8i16 (!cast(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)), + (nxv4i32 (!cast(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0, + nxv4i32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)), + (nxv2i64 (!cast(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0, + nxv2i64:$Op2, zsub1), + nxv2i64:$Op3))>; + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)), + (nxv8f16 (!cast(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)), + (nxv4f32 (!cast(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0, + nxv4f32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)), + (nxv2f64 (!cast(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0, + nxv2f64:$Op2, zsub1), + nxv2i64:$Op3))>; } class sve2_int_perm_tbx sz8_64, string asm, ZPRRegOp zprty> @@ -967,11 +1085,20 @@ class sve2_int_perm_tbx sz8_64, string asm, ZPRRegOp zprty> let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_perm_tbx { +multiclass sve2_int_perm_tbx { def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>; def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>; def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>; def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>; + + def : 
SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve_int_perm_reverse_z sz8_64, string asm, ZPRRegOp zprty> @@ -1072,7 +1199,7 @@ class sve_int_perm_insrs sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_int_perm_insrs { @@ -1102,7 +1229,7 @@ class sve_int_perm_insrv sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; } multiclass sve_int_perm_insrv { @@ -1135,7 +1262,7 @@ class sve_int_perm_extract_i let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1244,13 +1371,22 @@ class sve_int_pred_log opc, string asm> } -multiclass sve_int_pred_log opc, string asm, SDPatternOperator op> { +multiclass sve_int_pred_log opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred = null_frag> { def NAME : sve_int_pred_log; def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_B>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_H>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_D>; } @@ -1272,7 +1408,7 @@ class sve_int_log_imm opc, string asm> let Constraints = "$Zdn = $_Zdn"; let DecoderMethod = "DecodeSVELogicalImmInstruction"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1357,7 +1493,8 @@ class sve_int_bin_cons_arit_0 sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_arit_0 opc, string asm, SDPatternOperator op> { +multiclass sve_int_bin_cons_arit_0 opc, string asm, + SDPatternOperator op, SDPatternOperator int_op> { def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>; @@ -1367,6 +1504,12 @@ multiclass sve_int_bin_cons_arit_0 opc, string asm, SDPatternOperator op def : SVE_2_Op_Pat(NAME # _H)>; def : SVE_2_Op_Pat(NAME # _S)>; def : SVE_2_Op_Pat(NAME # _D)>; + + // Intrinsic version + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1394,7 +1537,7 @@ class sve_fp_2op_i_p_zds sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1423,15 +1566,21 @@ class sve_fp_2op_p_zds sz, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_2op_p_zds opc, string asm, - SDPatternOperator op> { - def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>; - def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>; - def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>; +multiclass 
sve_fp_2op_p_zds opc, string asm, string Ps, + SDPatternOperator op, DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _H)>; def : SVE_3_Op_Pat(NAME # _S)>; @@ -1449,6 +1598,16 @@ multiclass sve_fp_2op_p_zds_fscale opc, string asm, def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_fp_2op_p_zds_zeroing_hsd { + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3), asm, "\t$Zdn, $_Zdn, $Zm, $imm3", @@ -1466,7 +1625,7 @@ class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1551,7 +1710,7 @@ class sve_fp_3op_p_zds_a sz, bits<2> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1586,7 +1745,7 @@ class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1620,7 +1779,7 @@ class sve_fp_fma_by_indexed_elem sz, bit opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1646,12 +1805,12 @@ multiclass sve_fp_fma_by_indexed_elem(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b:$idx)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b:$idx))), - (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; - def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b:$idx))), - (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; } @@ -1694,12 +1853,12 @@ multiclass sve_fp_fmul_by_indexed_elem { let Inst{19-16} = Zm; } - def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b:$idx))), - (!cast(NAME # _H) $Op1, $Op2, VectorIndexH32b:$idx)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b:$idx))), - (!cast(NAME # _S) $Op1, $Op2, VectorIndexS32b:$idx)>; - def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b:$idx))), - (!cast(NAME # _D) $Op1, $Op2, VectorIndexD32b:$idx)>; + 
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))), + (!cast(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))), + (!cast(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>; } //===----------------------------------------------------------------------===// @@ -1727,7 +1886,7 @@ class sve_fp_fcmla sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1767,7 +1926,7 @@ class sve_fp_fcmla_by_indexed_elem sz, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -1785,10 +1944,10 @@ multiclass sve_fp_fcmla_by_indexed_elem { let Inst{19-16} = Zm; } - def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b:$idx), (i32 complexrotateop:$imm))), - (!cast(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b:$idx, complexrotateop:$imm)>; - def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b:$idx), (i32 complexrotateop:$imm))), - (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b:$idx, complexrotateop:$imm)>; + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -1815,7 +1974,7 @@ class sve_fp_fcadd sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1861,22 +2020,22 @@ multiclass sve2_fp_convert_down_narrow { def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>; def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>; - def : SVE_3_Op_Pat(op # _f16f32), nxv8f16, nxv16i1, nxv4f32, !cast(NAME # _StoH)>; - def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME # _DtoS)>; + def : SVE_3_Op_Pat(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast(NAME # _StoH)>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; } multiclass sve2_fp_convert_up_long { def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>; def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>; - def : SVE_3_Op_Pat(op # _f32f16), nxv4f32, nxv16i1, nxv8f16, !cast(NAME # _HtoS)>; - def : SVE_3_Op_Pat(op # _f64f32), nxv2f64, nxv16i1, nxv4f32, !cast(NAME # _StoD)>; + def : SVE_3_Op_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; + def : SVE_3_Op_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; } multiclass sve2_fp_convert_down_odd_rounding_top { def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>; - def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME 
# _DtoS)>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -1902,7 +2061,7 @@ class sve2_fp_pairwise_pred sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -1942,14 +2101,14 @@ class sve2_fp_mla_long_by_indexed_elem opc, string asm> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } multiclass sve2_fp_mla_long_by_indexed_elem opc, string asm, SDPatternOperator op> { def NAME : sve2_fp_mla_long_by_indexed_elem; - def : SVE_4_Op_Imm_Pat(NAME)>; + def : SVE_4_Op_Imm_Pat(NAME)>; } //===----------------------------------------------------------------------===// @@ -1974,7 +2133,7 @@ class sve2_fp_mla_long opc, string asm> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -2084,7 +2243,7 @@ class sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = size; } @@ -2120,7 +2279,7 @@ multiclass sve2_fp_flogb { multiclass sve2_fp_convert_down_odd_rounding { def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>; - def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME # _DtoS)>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -2176,7 +2335,7 @@ class sve_int_bin_pred_arit_log sz8_64, bits<2> fmt, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2192,11 +2351,20 @@ multiclass sve_int_bin_pred_log opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_0 opc, string asm, SDPatternOperator op> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_0 opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; @@ -2229,9 +2397,16 @@ multiclass sve_int_bin_pred_arit_2 opc, string asm, SDPatternOperator op } // Special case for divides which are not defined for 8b/16b elements. 
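// Illustrative sketch: the hunk below reworks this divide multiclass to carry a
// pseudo-instruction name (Ps), an intrinsic operator, a DestructiveInstTypeEnum
// value and the name of its reverse form. A minimal, assumed example of how a
// caller in AArch64SVEInstrInfo.td might instantiate it after this change; the
// opcode bits, intrinsic and instruction names here are assumptions for
// illustration and are not taken from this patch:
defm SDIV_ZPmZ  : sve_int_bin_pred_arit_2_div<0b100, "sdiv",  "SDIV_ZPmZ",
                    int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev,
                    "SDIVR_ZPmZ", 0>;
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPmZ",
                    int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev,
                    "SDIV_ZPmZ", 1>;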
-multiclass sve_int_bin_pred_arit_2_div opc, string asm, SDPatternOperator op> { - def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_2_div opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _S)>; def : SVE_3_Op_Pat(NAME # _D)>; @@ -2262,7 +2437,7 @@ class sve_int_mladdsub_vvv_pred sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2299,7 +2474,7 @@ class sve_int_mlas_vvv_pred sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2336,21 +2511,30 @@ class sve2_int_mla sz, bits<5> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_mla { +multiclass sve2_int_mla { def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>; def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>; def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>; def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve2_int_mla_long opc, string asm> { +multiclass sve2_int_mla_long opc, string asm, SDPatternOperator op> { def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>; def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>; def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2372,39 +2556,44 @@ class sve2_int_mla_by_indexed_elem sz, bits<6> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_mla_by_indexed_elem opc, bit S, string asm> { - def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve2_int_mla_by_indexed_elem opc, bit S, string asm, + SDPatternOperator op> { + def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + def _D : 
sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : SVE_4_Op_Imm_Pat(NAME # _H)>; + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// // SVE2 Integer Multiply-Add Long - Indexed Group //===----------------------------------------------------------------------===// -multiclass sve2_int_mla_long_by_indexed_elem opc, string asm> { +multiclass sve2_int_mla_long_by_indexed_elem opc, string asm, SDPatternOperator op> { def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, - asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2412,13 +2601,16 @@ multiclass sve2_int_mla_long_by_indexed_elem opc, string asm> { let Inst{11} = iop{0}; } def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, - asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2442,7 +2634,7 @@ class sve_intx_dot { @@ -2474,28 +2666,28 @@ class sve_intx_dot_by_indexed_elem { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { + def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { + def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } - def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))), - (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; - def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))), - (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; } //===----------------------------------------------------------------------===// @@ -2521,24 +2713,36 @@ class sve2_complex_int_arith sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_cintx_dot { +multiclass sve2_cintx_dot { def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>; def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 
complexrotateop:$imm))), + (!cast(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// // SVE2 Complex Multiply-Add Group //===----------------------------------------------------------------------===// -multiclass sve2_int_cmla { +multiclass sve2_int_cmla { def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>; def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>; def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>; def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>; + + def : SVE_4_Op_Imm_Pat(NAME # _B)>; + def : SVE_4_Op_Imm_Pat(NAME # _H)>; + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2563,42 +2767,58 @@ class sve2_complex_int_arith_indexed sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_cintx_dot_by_indexed_elem { - def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { +multiclass sve2_cintx_dot_by_indexed_elem { + def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { bit iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// // SVE2 Complex Multiply-Add - Indexed Group //===----------------------------------------------------------------------===// -multiclass sve2_cmla_by_indexed_elem { - def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> { +multiclass sve2_cmla_by_indexed_elem { + def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> { + def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> { bit iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3), + (i32 
VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -2621,11 +2841,22 @@ class sve2_int_mul sz, bits<3> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve2_int_mul opc, string asm> { +multiclass sve2_int_mul opc, string asm, SDPatternOperator op> { def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; def _H : sve2_int_mul<0b01, opc, asm, ZPR16>; def _S : sve2_int_mul<0b10, opc, asm, ZPR32>; def _D : sve2_int_mul<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +multiclass sve2_int_mul_single opc, string asm, SDPatternOperator op> { + def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; + + def : SVE_2_Op_Pat(NAME # _B)>; } //===----------------------------------------------------------------------===// @@ -2648,31 +2879,37 @@ class sve2_int_mul_by_indexed_elem sz, bits<4> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_int_mul_by_indexed_elem opc, string asm> { - def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve2_int_mul_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve2_int_mul_long_by_indexed_elem opc, string asm> { +multiclass sve2_int_mul_long_by_indexed_elem opc, string asm, + SDPatternOperator op> { def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, - ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2680,13 +2917,16 @@ multiclass sve2_int_mul_long_by_indexed_elem opc, string asm> { let Inst{11} = iop{0}; } def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, - ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2702,7 +2942,7 @@ class sve2_int_arith_pred sz, bits<6> opc, string asm, bits<5> Zdn; let Inst{31-24} = 0b01000100; let Inst{23-22} = sz; - let Inst{21} = 0b0; + let Inst{21-20} = 0b01; let Inst{20-16} = opc{5-1}; let Inst{15-14} = 0b10; let Inst{13} = opc{0}; @@ -2711,15 +2951,20 @@ class sve2_int_arith_pred sz, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = 
Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve2_int_arith_pred opc, string asm> { +multiclass sve2_int_arith_pred opc, string asm, SDPatternOperator op> { def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>; def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>; def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>; def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve2_int_sadd_long_accum_pairwise sz, bit U, string asm, @@ -2739,14 +2984,18 @@ class sve2_int_sadd_long_accum_pairwise sz, bit U, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty1.ElementSize; } -multiclass sve2_int_sadd_long_accum_pairwise { +multiclass sve2_int_sadd_long_accum_pairwise { def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>; def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>; def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve2_int_un_pred_arit sz, bit Q, bits<2> opc, @@ -2770,19 +3019,26 @@ class sve2_int_un_pred_arit sz, bit Q, bits<2> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve2_int_un_pred_arit_s opc, string asm> { +multiclass sve2_int_un_pred_arit_s opc, string asm, + SDPatternOperator op> { def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; + def : SVE_3_Op_Pat(NAME # _S)>; } -multiclass sve2_int_un_pred_arit opc, string asm> { +multiclass sve2_int_un_pred_arit opc, string asm, SDPatternOperator op> { def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>; def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>; def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2806,21 +3062,47 @@ class sve2_wide_int_arith sz, bits<5> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_wide_int_arith_long opc, string asm> { +multiclass sve2_wide_int_arith_long opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>; def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>; def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve2_wide_int_arith_wide opc, string asm> { +multiclass sve2_wide_int_arith_wide opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +multiclass sve2_wide_int_arith_pmul sz, bits<5> 
opc, string asm, + SDPatternOperator op> { + def NAME : sve2_wide_int_arith; + + // To avoid using 128 bit elements in the IR, the pattern below works with + // llvm intrinsics with the _pair suffix, to reflect that + // _Q is implemented as a pair of _D. + def : SVE_2_Op_Pat(NAME)>; } -multiclass sve2_pmul_long opc, string asm> { +multiclass sve2_pmul_long opc, string asm, SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>; def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>; + + // To avoid using 128 bit elements in the IR, the patterns below work with + // llvm intrinsics with the _pair suffix, to reflect that + // _H is implemented as a pair of _B and _D is implemented as a pair of _S. + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2844,17 +3126,27 @@ class sve2_misc sz, bits<4> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_misc_bitwise opc, string asm> { +multiclass sve2_misc_bitwise opc, string asm, SDPatternOperator op> { def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>; def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>; def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>; def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve2_misc_int_addsub_long_interleaved opc, string asm> { +multiclass sve2_misc_int_addsub_long_interleaved opc, string asm, + SDPatternOperator op> { def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve2_bitwise_xor_interleaved sz, bits<1> opc, string asm, @@ -2874,15 +3166,21 @@ class sve2_bitwise_xor_interleaved sz, bits<1> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_bitwise_xor_interleaved { +multiclass sve2_bitwise_xor_interleaved { def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>; def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>; def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>; def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve2_bitwise_shift_left_long tsz8_64, bits<2> opc, string asm, @@ -2905,7 +3203,8 @@ class sve2_bitwise_shift_left_long tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve2_bitwise_shift_left_long opc, string asm> { +multiclass sve2_bitwise_shift_left_long opc, string asm, + SDPatternOperator op> { def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, ZPR16, ZPR8, vecshiftL8>; def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, @@ -2916,6 +3215,9 @@ multiclass sve2_bitwise_shift_left_long opc, string asm> { ZPR64, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } + def : SVE_2_Op_Imm_Pat(NAME # _H)>; + def : SVE_2_Op_Imm_Pat(NAME # _S)>; + def : SVE_2_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ 
-2943,7 +3245,8 @@ class sve2_int_bin_shift_imm tsz8_64, bit opc, string asm, let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_bin_shift_imm_left { +multiclass sve2_int_bin_shift_imm_left { def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; @@ -2955,9 +3258,15 @@ multiclass sve2_int_bin_shift_imm_left { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve2_int_bin_shift_imm_right { +multiclass sve2_int_bin_shift_imm_right { def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -2969,6 +3278,11 @@ multiclass sve2_int_bin_shift_imm_right { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } class sve2_int_bin_accum_shift_imm tsz8_64, bits<2> opc, string asm, @@ -2990,11 +3304,12 @@ class sve2_int_bin_accum_shift_imm tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_bin_accum_shift_imm_right opc, string asm> { +multiclass sve2_int_bin_accum_shift_imm_right opc, string asm, + SDPatternOperator op> { def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3006,6 +3321,11 @@ multiclass sve2_int_bin_accum_shift_imm_right opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } class sve2_int_cadd sz, bit opc, string asm, ZPRRegOp zprty> @@ -3024,15 +3344,20 @@ class sve2_int_cadd sz, bit opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_cadd { +multiclass sve2_int_cadd { def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>; def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>; def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>; def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } class sve2_int_absdiff_accum sz, bits<4> opc, string asm, @@ -3052,28 +3377,41 @@ class sve2_int_absdiff_accum sz, bits<4> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_absdiff_accum { +multiclass sve2_int_absdiff_accum { def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, 
ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve2_int_absdiff_accum_long opc, string asm> { +multiclass sve2_int_absdiff_accum_long opc, string asm, + SDPatternOperator op> { def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve2_int_addsub_long_carry opc, string asm> { +multiclass sve2_int_addsub_long_carry opc, string asm, SDPatternOperator op> { def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, ZPR32, ZPR32>; def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -3300,7 +3638,7 @@ class sve_int_un_pred_arit sz8_64, bits<4> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -3465,11 +3803,12 @@ class sve_int_arith_imm0 sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_arith_imm0 opc, string asm, SDPatternOperator op> { +multiclass sve_int_arith_imm0 opc, string asm, + SDPatternOperator op, SDPatternOperator int_op> { def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; @@ -3479,6 +3818,12 @@ multiclass sve_int_arith_imm0 opc, string asm, SDPatternOperator op> { def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _H)>; def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _S)>; def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; + + // Intrinsic version + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; } multiclass sve_int_arith_imm0_subr opc, string asm, SDPatternOperator op> { @@ -3509,7 +3854,7 @@ class sve_int_arith_imm sz8_64, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -3519,10 +3864,10 @@ multiclass sve_int_arith_imm1 opc, string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperator op> { @@ -3531,10 +3876,10 @@ multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperato def _S : 
sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; } multiclass sve_int_arith_imm2 { @@ -3604,11 +3949,11 @@ class sve2_int_bitwise_ternary_op_d opc, string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_bitwise_ternary_op opc, string asm> { +multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op> { def NAME : sve2_int_bitwise_ternary_op_d; def : InstAlias opc, string asm> { (!cast(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>; def : InstAlias(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>; + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; } class sve2_int_rotate_right_imm tsz8_64, string asm, @@ -3638,11 +3988,11 @@ class sve2_int_rotate_right_imm tsz8_64, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve2_int_rotate_right_imm { +multiclass sve2_int_rotate_right_imm { def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>; def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3654,6 +4004,10 @@ multiclass sve2_int_rotate_right_imm { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -3678,7 +4032,7 @@ class sve_int_dup_fpimm_pred sz, Operand fpimmtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -3713,26 +4067,34 @@ class sve_int_dup_imm_pred sz8_64, bit m, string asm, let Inst{12-5} = imm{7-0}; // imm8 let Inst{4-0} = Zd; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_dup_imm_pred_merge { - let Constraints = "$Zd = $_Zd" in { - def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; - } - - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 
cpy_imm8_opt_lsl_i32:$imm), 1>; +multiclass sve_int_dup_imm_pred_merge_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + let Constraints = "$Zd = $_Zd" in + def NAME : sve_int_dup_imm_pred; def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; + (!cast(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + intty:$Zd)), + (!cast(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; +} + +multiclass sve_int_dup_imm_pred_merge { + defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; @@ -3742,20 +4104,35 @@ multiclass sve_int_dup_imm_pred_merge { (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; } -multiclass sve_int_dup_imm_pred_zero { - def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; - - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>; +multiclass sve_int_dup_imm_pred_zero_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + def NAME : sve_int_dup_imm_pred; def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; + (!cast(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty (zext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty (sext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, -1, 0)>; + def : Pat<(intty (anyext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty 0))))), + (!cast(NAME) $Pg, i32:$imm, i32:$shift)>; +} + +multiclass sve_int_dup_imm_pred_zero { + defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; } //===----------------------------------------------------------------------===// @@ -3787,17 +4164,24 @@ class 
sve_int_cmp sz8_64, bits<3> opc, string asm, let Defs = [NZCV]; } -multiclass sve_int_cmp_0 opc, string asm, SDPatternOperator op, - CondCode cc> { +multiclass SVE_SETCC_Pat { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)), + (cmp $Op1, $Op2, $Op3)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)), + (cmp $Op1, $Op3, $Op2)>; +} + +multiclass sve_int_cmp_0 opc, string asm, CondCode cc, CondCode invcc> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>; def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>; def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>; - def : SVE_3_Op_Pat(NAME # _B)>; - def : SVE_3_Op_Pat(NAME # _H)>; - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; + defm : SVE_SETCC_Pat(NAME # _B)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _D)>; } multiclass sve_int_cmp_0_wide opc, string asm, SDPatternOperator op> { @@ -3852,67 +4236,35 @@ class sve_int_scmp_vi sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let ElementSize = pprty.ElementSize; } -multiclass sve_int_scmp_vi opc, string asm, CondCode cc, - SDPatternOperator op = null_frag, - SDPatternOperator inv_op = null_frag> { +multiclass SVE_SETCC_Imm_Pat { + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt ZPR:$Zs1), + (intvt (AArch64dup (immtype:$imm))), + cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt (AArch64dup (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; +} + +multiclass sve_int_scmp_vi opc, string asm, CondCode cc, CondCode commuted_cc> { def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>; def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>; def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>; def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>; - // IR version - def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (simm5_32b:$imm))), - cc)), - (!cast(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (simm5_64b:$imm))), - cc)), - (!cast(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, simm5_64b:$imm)>; - - // Intrinsic version - def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (simm5_32b:$imm))))), - (!cast(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (simm5_32b:$imm))))), - (!cast(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (simm5_32b:$imm))))), - (!cast(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (simm5_64b:$imm))))), - (!cast(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>; - - // Inverted intrinsic version - def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 (AArch64dup 
(simm5_32b:$imm))), - (nxv16i8 ZPR:$Zs1))), - (!cast(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 (AArch64dup (simm5_32b:$imm))), - (nxv8i16 ZPR:$Zs1))), - (!cast(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 (AArch64dup (simm5_32b:$imm))), - (nxv4i32 ZPR:$Zs1))), - (!cast(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>; - def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 (AArch64dup (simm5_64b:$imm))), - (nxv2i64 ZPR:$Zs1))), - (!cast(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>; + defm : SVE_SETCC_Imm_Pat(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat(NAME # _H)>; + defm : SVE_SETCC_Imm_Pat(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat(NAME # _D)>; } @@ -3944,66 +4296,20 @@ class sve_int_ucmp_vi sz8_64, bits<2> opc, string asm, PPRRegOp pprty, } multiclass sve_int_ucmp_vi opc, string asm, CondCode cc, - SDPatternOperator op = null_frag, - SDPatternOperator inv_op = null_frag> { + CondCode commuted_cc> { def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>; def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>; def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>; def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>; - // IR version - def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (imm0_127:$imm))), - cc)), - (!cast(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))), - cc)), - (!cast(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, imm0_127_64b:$imm)>; - - // Intrinsic version - def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 ZPR:$Zs1), - (nxv16i8 (AArch64dup (imm0_127:$imm))))), - (!cast(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 ZPR:$Zs1), - (nxv8i16 (AArch64dup (imm0_127:$imm))))), - (!cast(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 ZPR:$Zs1), - (nxv4i32 (AArch64dup (imm0_127:$imm))))), - (!cast(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 ZPR:$Zs1), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))))), - (!cast(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>; - - // Inverted intrinsic version - def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg), - (nxv16i8 (AArch64dup (imm0_127:$imm))), - (nxv16i8 ZPR:$Zs1))), - (!cast(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg), - (nxv8i16 (AArch64dup (imm0_127:$imm))), - (nxv8i16 ZPR:$Zs1))), - (!cast(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg), - (nxv4i32 (AArch64dup (imm0_127:$imm))), - (nxv4i32 ZPR:$Zs1))), - (!cast(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>; - def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg), - (nxv2i64 (AArch64dup (imm0_127_64b:$imm))), - (nxv2i64 ZPR:$Zs1))), - (!cast(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>; + defm : SVE_SETCC_Imm_Pat(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat(NAME # _H)>; + defm : 
SVE_SETCC_Imm_Pat(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat(NAME # _D)>; } @@ -4096,11 +4402,17 @@ class sve2_int_while_rr sz8_64, bits<1> rw, string asm, let Defs = [NZCV]; } -multiclass sve2_int_while_rr rw, string asm> { +multiclass sve2_int_while_rr rw, string asm, string op> { def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>; def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; + + def : SVE_2_Op_Pat(op # _b), i64, i64, !cast(NAME # _B)>; + def : SVE_2_Op_Pat(op # _h), i64, i64, !cast(NAME # _H)>; + def : SVE_2_Op_Pat(op # _s), i64, i64, !cast(NAME # _S)>; + def : SVE_2_Op_Pat(op # _d), i64, i64, !cast(NAME # _D)>; + } //===----------------------------------------------------------------------===// @@ -4108,8 +4420,8 @@ multiclass sve2_int_while_rr rw, string asm> { //===----------------------------------------------------------------------===// class sve_fp_fast_red sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), asm, "\t$Vd, $Pg, $Zn", "", []>, Sched<[]> { @@ -4127,13 +4439,13 @@ class sve_fp_fast_red sz, bits<3> opc, string asm, } multiclass sve_fp_fast_red opc, string asm, SDPatternOperator op> { - def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>; + def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat(NAME # _H)>; - def : SVE_2_Op_Pat(NAME # _S)>; - def : SVE_2_Op_Pat(NAME # _D)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } @@ -4142,8 +4454,8 @@ multiclass sve_fp_fast_red opc, string asm, SDPatternOperator op> { //===----------------------------------------------------------------------===// class sve_fp_2op_p_vd sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm), asm, "\t$Vdn, $Pg, $_Vdn, $Zm", "", []>, @@ -4164,13 +4476,13 @@ class sve_fp_2op_p_vd sz, bits<3> opc, string asm, } multiclass sve_fp_2op_p_vd opc, string asm, SDPatternOperator op> { - def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>; + def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_3_Op_Pat(NAME # _H)>; - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4210,6 +4522,22 @@ multiclass sve_fp_3op_p_pd opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_fp_3op_p_pd_cc opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred> +: sve_fp_3op_p_pd { + def : 
SVE_2_Op_AllActive_Pat(NAME # _H), PTRUE_H>; + def : SVE_2_Op_AllActive_Pat(NAME # _H), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat(NAME # _H), PTRUE_D>; + def : SVE_2_Op_AllActive_Pat(NAME # _S), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat(NAME # _S), PTRUE_D>; + def : SVE_2_Op_AllActive_Pat(NAME # _D), PTRUE_D>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Compare - with Zero Group @@ -4263,11 +4591,20 @@ class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ii { - def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>; - def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>; +multiclass sve_int_index_ii { + def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>; + def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>; def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>; def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)), + (!cast(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>; + def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)), + (!cast(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>; + def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)), + (!cast(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>; + def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)), + (!cast(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>; } class sve_int_index_ir sz8_64, string asm, ZPRRegOp zprty, @@ -4287,11 +4624,20 @@ class sve_int_index_ir sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ir { - def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>; +multiclass sve_int_index_ir { + def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)), + (!cast(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)), + (!cast(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)), + (!cast(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>; + def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)), + (!cast(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>; } class sve_int_index_ri sz8_64, string asm, ZPRRegOp zprty, @@ -4311,11 +4657,20 @@ class sve_int_index_ri sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ri { - def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>; +multiclass sve_int_index_ri { + def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>; + + def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)), + (!cast(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>; + def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)), + (!cast(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>; + def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)), + (!cast(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>; + def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)), + (!cast(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>; } class sve_int_index_rr sz8_64, string asm, 
ZPRRegOp zprty, @@ -4335,19 +4690,23 @@ class sve_int_index_rr sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_rr { +multiclass sve_int_index_rr { def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>; def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>; def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>; def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } // //===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, - ZPRRegOp zprty, Operand immtype, - ElementSizeEnum size> + ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), asm, "\t$Zdn, $Pg/m, $_Zdn, $imm", "", @@ -4366,50 +4725,99 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; - let ElementSize = size; + let DestructiveInstType = DestructiveBinaryImm; + let ElementSize = zprty.ElementSize; +} + +multiclass sve_int_bin_pred_shift_imm_left opc, string asm, string psName=""> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { + let Inst{8} = imm{3}; + } + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + let Inst{9-8} = imm{4-3}; + } + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + let Inst{22} = imm{5}; + let Inst{9-8} = imm{4-3}; + } } -multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, - ElementSizeH> { +multiclass sve2_int_bin_pred_shift_imm_left opc, string asm, + string psName, + SDPatternOperator op> { + + def _B : SVEPseudo2Instr, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, - ElementSizeS> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, - ElementSizeD> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; 
} -multiclass sve_int_bin_pred_shift_imm_right opc, string asm, +multiclass sve_int_bin_pred_shift_imm_right opc, string asm, string Ps, SDPatternOperator op = null_frag> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, - ElementSizeH> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, - ElementSizeS> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, - ElementSizeD> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } - def : SVE_3_Op_Imm_Pat(NAME # _B)>; - def : SVE_3_Op_Imm_Pat(NAME # _H)>; - def : SVE_3_Op_Imm_Pat(NAME # _S)>; - def : SVE_3_Op_Imm_Pat(NAME # _D)>; + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; } class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, @@ -4432,23 +4840,40 @@ class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift opc, string asm, - SDPatternOperator op> { - def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; - def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; - def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; - def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; - +multiclass sve_int_bin_pred_shift opc, string asm, string Ps, + SDPatternOperator op, string revname, bit isReverseInstr = 0> { + let DestructiveInstType = DestructiveBinaryCommWithRev in { + def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; def : SVE_3_Op_Pat(NAME # _S)>; def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_zeroing_bhsd { + def _ZERO_B : PredTwoOpPseudo; + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : 
SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + multiclass sve_int_bin_pred_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; @@ -4493,7 +4918,8 @@ class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", - "", []>, Sched<[]> { + "", + []>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<6> imm; @@ -4508,7 +4934,8 @@ class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_imm_left opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_left opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; @@ -4520,9 +4947,15 @@ multiclass sve_int_bin_cons_shift_imm_left opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; } -multiclass sve_int_bin_cons_shift_imm_right opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_right opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -4534,6 +4967,11 @@ multiclass sve_int_bin_cons_shift_imm_right opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_Shift_Pred_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// // SVE Memory - Store Group @@ -4743,16 +5181,36 @@ class sve2_mem_sstnt_vs_base opc, string asm, let mayStore = 1; } -multiclass sve2_mem_sstnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_sstnt_vs_base; +multiclass sve2_mem_sstnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 
GPR64:$Rm)>; } class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, @@ -5094,6 +5552,17 @@ class sve_int_rdffr_pred let Uses = [FFR]; } +multiclass sve_int_rdffr_pred { + def _REAL : sve_int_rdffr_pred; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>, + PseudoInstExpansion<(!cast(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>; + } +} + class sve_int_rdffr_unpred : I< (outs PPR8:$Pd), (ins), asm, "\t$Pd", @@ -5106,11 +5575,22 @@ class sve_int_rdffr_unpred : I< let Uses = [FFR]; } -class sve_int_wrffr +multiclass sve_int_rdffr_unpred { + def _REAL : sve_int_rdffr_unpred; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>, + PseudoInstExpansion<(!cast(NAME # _REAL) PPR8:$Pd)>; + } +} + +class sve_int_wrffr : I<(outs), (ins PPR8:$Pn), asm, "\t$Pn", "", - []>, Sched<[]> { + [(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> { bits<4> Pn; let Inst{31-9} = 0b00100101001010001001000; let Inst{8-5} = Pn; @@ -5120,11 +5600,11 @@ class sve_int_wrffr let Defs = [FFR]; } -class sve_int_setffr +class sve_int_setffr : I<(outs), (ins), asm, "", "", - []>, Sched<[]> { + [(op)]>, Sched<[]> { let Inst{31-0} = 0b00100101001011001001000000000000; let hasSideEffects = 1; @@ -5219,7 +5699,7 @@ class sve_int_perm_clast_zz sz8_64, bit ab, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -5317,7 +5797,7 @@ class sve_int_perm_splice sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } @@ -5332,9 +5812,9 @@ multiclass sve_int_perm_splice { def : SVE_3_Op_Pat(NAME # _S)>; def : SVE_3_Op_Pat(NAME # _D)>; - def : SVE_3_Op_Pat(NAME # _H)>; - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve2_int_perm_splice_cons sz8_64, string asm, @@ -5380,7 +5860,7 @@ class sve_int_perm_rev sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -5443,11 +5923,11 @@ class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_r { +multiclass sve_int_perm_cpy_r { def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>; def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>; def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>; @@ -5461,6 +5941,15 @@ multiclass sve_int_perm_cpy_r { (!cast(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Rn", (!cast(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>; + + def : Pat<(nxv16i8 (op 
nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)), + (!cast(NAME # _B) $passthru, $pg, $splat)>; + def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)), + (!cast(NAME # _D) $passthru, $pg, $splat)>; } class sve_int_perm_cpy_v sz8_64, string asm, ZPRRegOp zprty, @@ -5480,11 +5969,11 @@ class sve_int_perm_cpy_v sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_v { +multiclass sve_int_perm_cpy_v { def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>; def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>; def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>; @@ -5498,6 +5987,16 @@ multiclass sve_int_perm_cpy_v { (!cast(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Vn", (!cast(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>; + + + def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)), + (!cast(NAME # _D) $passthru, $pg, $splat)>; } class sve_int_perm_compact @@ -5557,14 +6056,21 @@ class sve_mem_cld_si_base dtype, bit nf, string asm, multiclass sve_mem_cld_si_base dtype, bit nf, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def "" : sve_mem_cld_si_base; + def _REAL : sve_mem_cld_si_base; def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + (!cast(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; + (!cast(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; def : InstAlias(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + (!cast(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>; + } } multiclass sve_mem_cld_si dtype, string asm, RegisterOperand listty, @@ -5773,6 +6279,13 @@ multiclass sve_mem_cldff_ss dtype, string asm, RegisterOperand listty, def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>; + } } multiclass sve_mem_cldnf_si dtype, string asm, RegisterOperand listty, @@ -5878,10 +6391,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled opc, string asm, def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, @@ -5898,10 +6420,19 @@ multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, def : InstAlias(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } @@ -5940,8 +6471,15 @@ multiclass sve_mem_32b_gld_vi_32_ptrs opc, string asm, Operand imm_ty, def : InstAlias(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_prfm_si msz, string asm> @@ -6022,9 +6560,17 @@ class sve_mem_32b_prfm_sv msz, bit xs, string asm, multiclass sve_mem_32b_prfm_sv_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_32b_prfm_sv; def _SXTW_SCALED : sve_mem_32b_prfm_sv; + + def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; } class sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> @@ -6047,11 +6593,14 @@ class sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> let Inst{3-0} = prfop; } -multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_32b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; } class sve_mem_z_fill @@ -6130,17 +6679,38 @@ class sve2_mem_gldnt_vs_base opc, dag iops, string asm, let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base; +multiclass sve2_mem_gldnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// @@ -6190,10 +6760,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled opc, string asm, def : InstAlias(NAME 
# _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; - def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, @@ -6210,10 +6789,19 @@ multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, def : InstAlias(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, @@ -6224,8 +6812,15 @@ multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, @@ -6235,8 +6830,15 @@ multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> @@ -6274,8 +6876,15 @@ multiclass sve_mem_64b_gld_vi_64_ptrs opc, string asm, Operand imm_ty, def : InstAlias(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } // bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl) @@ -6305,14 +6914,27 @@ class sve_mem_64b_prfm_sv msz, bit xs, bit lsl, string asm, multiclass sve_mem_64b_prfm_sv_ext_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_64b_prfm_sv; def _SXTW_SCALED : sve_mem_64b_prfm_sv; + + def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } multiclass sve_mem_64b_prfm_sv_lsl_scaled msz, string asm, - RegisterOperand zprext> { + RegisterOperand zprext, PatFrag frag> { def NAME : sve_mem_64b_prfm_sv; + + def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } @@ -6338,13 +6960,15 @@ class sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> let hasSideEffects = 1; } -multiclass sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_64b_prfm_vi msz, 
string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_64b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; -} + def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; +} //===----------------------------------------------------------------------===// // SVE Compute Vector Address Group @@ -6600,6 +7224,12 @@ class sve_int_brkp opc, string asm> let Defs = !if(!eq (opc{1}, 1), [NZCV], []); } +multiclass sve_int_brkp opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkp; + + def : SVE_3_Op_Pat(NAME)>; +} + //===----------------------------------------------------------------------===// // SVE Partition Break Group @@ -6626,6 +7256,12 @@ class sve_int_brkn let Defs = !if(!eq (S, 0b1), [NZCV], []); } +multiclass sve_int_brkn opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkn; + + def : SVE_3_Op_Pat(NAME)>; +} + class sve_int_break opc, string asm, string suffix, dag iops> : I<(outs PPR8:$Pd), iops, asm, "\t$Pd, $Pg"#suffix#", $Pn", @@ -6648,12 +7284,16 @@ class sve_int_break opc, string asm, string suffix, dag iops> } -multiclass sve_int_break_m opc, string asm> { +multiclass sve_int_break_m opc, string asm, SDPatternOperator op> { def NAME : sve_int_break; + + def : SVE_3_Op_Pat(NAME)>; } -multiclass sve_int_break_z opc, string asm> { +multiclass sve_int_break_z opc, string asm, SDPatternOperator op> { def NAME : sve_int_break; + + def : SVE_2_Op_Pat(NAME)>; } //===----------------------------------------------------------------------===// @@ -6683,20 +7323,23 @@ class sve2_char_match { +multiclass sve2_char_match { def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>; def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; } //===----------------------------------------------------------------------===// // SVE2 Histogram Computation - Segment Group //===----------------------------------------------------------------------===// -class sve2_hist_gen_segment +class sve2_hist_gen_segment : I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm), asm, "\t$Zd, $Zn, $Zm", "", - []>, Sched<[]> { + [(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<5> Zm; @@ -6730,9 +7373,12 @@ class sve2_hist_gen_vector let Inst{4-0} = Zd; } -multiclass sve2_hist_gen_vector { +multiclass sve2_hist_gen_vector { def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>; def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -6755,6 +7401,12 @@ class sve2_crypto_cons_bin_op let Inst{4-0} = Zd; } +multiclass sve2_crypto_cons_bin_op { + def NAME : sve2_crypto_cons_bin_op; + def : SVE_2_Op_Pat(NAME)>; +} + class sve2_crypto_des_bin_op opc, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm), asm, "\t$Zdn, $_Zdn, $Zm", @@ -6772,8 +7424,14 @@ class sve2_crypto_des_bin_op opc, string asm, ZPRRegOp zprty> let Constraints = "$Zdn = $_Zdn"; } -class sve2_crypto_unary_op -: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn), +multiclass sve2_crypto_des_bin_op opc, string asm, ZPRRegOp zprty, + SDPatternOperator op, ValueType vt> { + def NAME : sve2_crypto_des_bin_op; + def : SVE_2_Op_Pat(NAME)>; +} + +class sve2_crypto_unary_op +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn), asm, "\t$Zdn, 
$_Zdn", "", []>, Sched<[]> { @@ -6785,3 +7443,389 @@ class sve2_crypto_unary_op let Constraints = "$Zdn = $_Zdn"; } + +multiclass sve2_crypto_unary_op { + def NAME : sve2_crypto_unary_op; + def : SVE_1_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE BFloat16 Group +//===----------------------------------------------------------------------===// + +class sve_bfloat_dot_base opc, string asm, string ops, dag iops> +: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + let Inst{31-21} = 0b01100100011; + let Inst{15-14} = opc; + let Inst{13-10} = 0b0000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +class sve_bfloat_dot +: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> { + bits<5> Zm; + let Inst{20-16} = Zm; +} + +multiclass sve_bfloat_dot { + def NAME : sve_bfloat_dot; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve_bfloat_dot_indexed +: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; +} + +multiclass sve_bfloat_dot_indexed { + def NAME : sve_bfloat_dot_indexed; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + +class sve_bfloat_matmul +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zm; + bits<5> Zda; + bits<5> Zn; + let Inst{31-21} = 0b01100100011; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +multiclass sve_bfloat_matmul { + def NAME : sve_bfloat_matmul; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve_bfloat_matmul_longvecl +: sve_bfloat_matmul { + let Inst{23} = 0b1; + let Inst{14-13} = 0b00; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl { + def NAME : sve_bfloat_matmul_longvecl; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve_bfloat_matmul_longvecl_idx +: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop", + (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> { + bits<3> iop; + bits<3> Zm; + let Inst{23} = 0b1; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{11} = iop{0}; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl_idx { + def NAME : sve_bfloat_matmul_longvecl_idx; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + +class sve_bfloat_convert +: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn), + asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<3> Pg; + bits<5> Zn; + let Inst{31-25} = 0b0110010; + let Inst{24} = N; + let Inst{23-13} = 0b10001010101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = DestructiveOther; + let hasSideEffects = 1; + let ElementSize = ElementSizeS; +} + +multiclass sve_bfloat_convert { + def NAME : sve_bfloat_convert; + def : SVE_3_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Matrix Multiply Group +//===----------------------------------------------------------------------===// + +class sve_int_matmul uns, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + 
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = uns; + let Inst{21} = 0; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b100110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_matmul uns, string asm, SDPatternOperator op> { + def NAME : sve_int_matmul; + + def : SVE_3_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000100100; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b011110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_dot_mixed { + def NAME : sve_int_dot_mixed; + + def : SVE_3_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign - Indexed Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed_indexed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx), + asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<2> idx; + let Inst{31-21} = 0b01000100101; + let Inst{20-19} = idx; + let Inst{18-16} = Zm; + let Inst{15-11} = 0b00011; + let Inst{10} = U; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +multiclass sve_int_dot_mixed_indexed { + def NAME : sve_int_dot_mixed_indexed; + + def : SVE_4_Op_Imm_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Floating Point Matrix Multiply Accumulate Group +//===----------------------------------------------------------------------===// + +class sve_fp_matrix_mla +: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-23} = 0b011001001; + let Inst{22} = sz; + let Inst{21} = 1; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +multiclass sve_fp_matrix_mla { + def NAME : sve_fp_matrix_mla; + + def : SVE_3_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE Memory - Contiguous Load And Replicate 256-bit Group +//===----------------------------------------------------------------------===// + +class sve_mem_ldor_si sz, string asm, RegisterOperand VecList> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), + asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> { + bits<5> Zt; + bits<5> Rn; + bits<3> Pg; + bits<4> imm4; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-20} = 0b010; + let Inst{19-16} = imm4; + let 
Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> { + def NAME : sve_mem_ldor_si; + def : InstAlias(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; + + // Base addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), + (!cast(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; + +} + +class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, + RegisterOperand gprty> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), + asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> { + bits<5> Zt; + bits<3> Pg; + bits<5> Rn; + bits<5> Rm; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-21} = 0b01; + let Inst{20-16} = Rm; + let Inst{15-13} = 0; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_ss sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty, + ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> { + def NAME : sve_mem_ldor_ss; + + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; + + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))), + (!cast(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>; +} + +//===----------------------------------------------------------------------===// +// SVE Interleave 128-bit Elements Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_128_zz opc, bit P, string asm> +: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-21} = 0b00000101101; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b000; + let Inst{12-11} = opc; + let Inst{10} = P; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve_int_perm_bin_perm_128_zz opc, bit P, string asm, SDPatternOperator op> { + def NAME : sve_int_perm_bin_perm_128_zz; + + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; +} + +/// Addressing modes +def am_sve_indexed_s4 :ComplexPattern", [], [SDNPWantRoot]>; +def am_sve_indexed_s6 :ComplexPattern", [], [SDNPWantRoot]>; + +def am_sve_regreg_lsl0 : ComplexPattern", []>; +def am_sve_regreg_lsl1 : ComplexPattern", []>; +def am_sve_regreg_lsl2 : ComplexPattern", []>; +def am_sve_regreg_lsl3 : ComplexPattern", []>; + +// Predicated pseudo floating point two operand instructions. +multiclass sve_fp_bin_pred_hfd { + def _UNDEF_H : PredTwoOpPseudo; + def _UNDEF_S : PredTwoOpPseudo; + def _UNDEF_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_D)>; +} + +// Predicated pseudo integer two operand instructions. 
+multiclass sve_int_bin_pred_bhsd { + def _UNDEF_B : PredTwoOpPseudo; + def _UNDEF_H : PredTwoOpPseudo; + def _UNDEF_S : PredTwoOpPseudo; + def _UNDEF_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat(NAME # _UNDEF_B)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_D)>; +} + +// As sve_int_bin_pred but when only i32 and i64 vector types are required. +multiclass sve_int_bin_pred_sd { + def _UNDEF_S : PredTwoOpPseudo; + def _UNDEF_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat(NAME # _UNDEF_D)>; +} diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp new file mode 100644 index 0000000000000..74fe0cdd1ea7f --- /dev/null +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -0,0 +1,265 @@ +//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Performs general IR level optimizations on SVE intrinsics. +// +// The main goal of this pass is to remove unnecessary reinterpret +// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g: +// +// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %a) +// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %1) +// +// This pass also looks for ptest intrinsics & phi instructions where the +// operands are being needlessly converted to and from svbool_t. +// +//===----------------------------------------------------------------------===// + +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "sve-intrinsic-opts" + +namespace llvm { +void initializeSVEIntrinsicOptsPass(PassRegistry &); +} + +namespace { +struct SVEIntrinsicOpts : public ModulePass { + static char ID; // Pass identification, replacement for typeid + SVEIntrinsicOpts() : ModulePass(ID) { + initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + static IntrinsicInst *isReinterpretToSVBool(Value *V); + + static bool optimizeIntrinsic(Instruction *I); + + bool optimizeFunctions(SmallSetVector &Functions); + + static bool optimizeConvertFromSVBool(IntrinsicInst *I); + static bool optimizePTest(IntrinsicInst *I); + + static bool processPhiNode(IntrinsicInst *I); +}; +} // end anonymous namespace + +void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); +} + +char SVEIntrinsicOpts::ID = 0; +static const char *name = "SVE intrinsics optimizations"; +INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) + +namespace llvm { +ModulePass *createSVEIntrinsicOptsPass() { 
return new SVEIntrinsicOpts(); } +} // namespace llvm + +/// Returns V if it's a cast from (aka svbool_t), nullptr +/// otherwise. +IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) { + IntrinsicInst *I = dyn_cast(V); + if (!I) + return nullptr; + + if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) + return nullptr; + + return I; +} + +/// The function will remove redundant reinterprets casting in the presence +/// of the control flow +bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) { + + SmallVector Worklist; + auto RequiredType = X->getType(); + + auto *PN = dyn_cast(X->getArgOperand(0)); + assert(PN && "Expected Phi Node!"); + + // Don't create a new Phi unless we can remove the old one. + if (!PN->hasOneUse()) + return false; + + for (Value *IncValPhi : PN->incoming_values()) { + auto *Reinterpret = isReinterpretToSVBool(IncValPhi); + if (!Reinterpret || + RequiredType != Reinterpret->getArgOperand(0)->getType()) + return false; + } + + // Create the new Phi + LLVMContext &Ctx = PN->getContext(); + IRBuilder<> Builder(Ctx); + Builder.SetInsertPoint(PN); + PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); + Worklist.push_back(PN); + + for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { + auto *Reinterpret = cast(PN->getIncomingValue(I)); + NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); + Worklist.push_back(Reinterpret); + } + + // Cleanup Phi Node and reinterprets + X->replaceAllUsesWith(NPN); + X->eraseFromParent(); + + for (auto &I : Worklist) + if (I->use_empty()) + I->eraseFromParent(); + + return true; +} + +bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) { + IntrinsicInst *Op1 = dyn_cast(I->getArgOperand(0)); + IntrinsicInst *Op2 = dyn_cast(I->getArgOperand(1)); + + if (Op1 && Op2 && + Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { + + Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; + Type *Tys[] = {Op1->getArgOperand(0)->getType()}; + Module *M = I->getParent()->getParent()->getParent(); + + auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys); + auto CI = CallInst::Create(Fn, Ops, I->getName(), I); + + I->replaceAllUsesWith(CI); + I->eraseFromParent(); + if (Op1->use_empty()) + Op1->eraseFromParent(); + if (Op2->use_empty()) + Op2->eraseFromParent(); + + return true; + } + + return false; +} + +bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { + assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool && + "Unexpected opcode"); + + // If the reinterpret instruction operand is a PHI Node + if (isa(I->getArgOperand(0))) + return processPhiNode(I); + + // If we have a reinterpret intrinsic I of type A which is converting from + // another reinterpret Y of type B, and the source type of Y is A, then we can + // elide away both reinterprets if there are no other users of Y. 
+ auto *Y = isReinterpretToSVBool(I->getArgOperand(0)); + if (!Y) + return false; + + Value *SourceVal = Y->getArgOperand(0); + if (I->getType() != SourceVal->getType()) + return false; + + I->replaceAllUsesWith(SourceVal); + I->eraseFromParent(); + if (Y->use_empty()) + Y->eraseFromParent(); + + return true; +} + +bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) { + IntrinsicInst *IntrI = dyn_cast(I); + if (!IntrI) + return false; + + switch (IntrI->getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + return optimizeConvertFromSVBool(IntrI); + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + return optimizePTest(IntrI); + default: + return false; + } + + return true; +} + +bool SVEIntrinsicOpts::optimizeFunctions( + SmallSetVector &Functions) { + bool Changed = false; + for (auto *F : Functions) { + DominatorTree *DT = &getAnalysis(*F).getDomTree(); + + // Traverse the DT with an rpo walk so we see defs before uses, allowing + // simplification to be done incrementally. + BasicBlock *Root = DT->getRoot(); + ReversePostOrderTraversal RPOT(Root); + for (auto *BB : RPOT) + for (Instruction &I : make_early_inc_range(*BB)) + Changed |= optimizeIntrinsic(&I); + } + return Changed; +} + +bool SVEIntrinsicOpts::runOnModule(Module &M) { + bool Changed = false; + SmallSetVector Functions; + + // Check for SVE intrinsic declarations first so that we only iterate over + // relevant functions. Where an appropriate declaration is found, store the + // function(s) where it is used so we can target these only. + for (auto &F : M.getFunctionList()) { + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + for (auto I = F.user_begin(), E = F.user_end(); I != E;) { + auto *Inst = dyn_cast(*I++); + Functions.insert(Inst->getFunction()); + } + break; + default: + break; + } + } + + if (!Functions.empty()) + Changed |= optimizeFunctions(Functions); + + return Changed; +} diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 87980cddb7c0b..4e289fbe23257 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -658,6 +658,7 @@ namespace AArch64 { // in index i*P of a vector. The other elements of the // vector (such as index 1) are undefined. static constexpr unsigned SVEBitsPerBlock = 128; +static constexpr unsigned SVEMaxBitsPerVector = 2048; const unsigned NeonBitsPerVector = 128; } // end namespace AArch64 } // end namespace llvm -- cgit v1.2.3
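For context on the new SVEIntrinsicOpts pass added above, the redundancy it targets can be sketched in LLVM IR as follows. This is an illustrative example only: the function and value names are hypothetical, and the intrinsic signatures follow the pattern documented in the pass's own header comment.

  define <vscale x 4 x i1> @redundant_reinterpret(<vscale x 4 x i1> %a) {
    ; Widen the predicate to svbool_t (<vscale x 16 x i1>), then narrow it straight back.
    %widened = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
    %narrowed = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %widened)
    ; optimizeConvertFromSVBool replaces %narrowed with %a and erases both calls once they are dead.
    ret <vscale x 4 x i1> %narrowed
  }

  declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
  declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)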