author    Dimitry Andric <dim@FreeBSD.org>  2020-07-26 19:36:28 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2020-07-26 19:36:28 +0000
commit    cfca06d7963fa0909f90483b42a6d7d194d01e08
tree      209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AArch64
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b
Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h9
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td174
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp182
-rw-r--r--llvm/lib/Target/AArch64/AArch64BranchTargets.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td145
-rw-r--r--llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64CollectLOH.cpp21
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td68
-rw-r--r--llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp16
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp37
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp14
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp327
-rw-r--r--llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp702
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h30
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp743
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp3005
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h236
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td560
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrGISel.td124
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp557
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h99
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td632
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp122
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h32
-rw-r--r--llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp140
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.h19
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td34
-rw-r--r--llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SLSHardening.cpp443
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td2140
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA53.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA57.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedCyclone.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM3.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkor.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedKryo.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td1997
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackOffset.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp49
-rw-r--r--llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp67
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h61
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td60
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp67
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h8
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.h5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp202
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h85
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp95
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp36
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp (renamed from llvm/lib/Target/AArch64/AArch64CallLowering.cpp)96
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h (renamed from llvm/lib/Target/AArch64/AArch64CallLowering.h)2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (renamed from llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp)1396
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (renamed from llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp)116
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h (renamed from llvm/lib/Target/AArch64/AArch64LegalizerInfo.h)13
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp507
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp (renamed from llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp)49
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp (renamed from llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp)202
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h (renamed from llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h)0
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h7
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp84
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp17
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp28
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp52
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h21
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp31
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp16
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td1918
-rw-r--r--llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp265
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h1
89 files changed, 15016 insertions, 3367 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index ac765ebcddc04..fd35b530e3ce4 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -38,6 +38,8 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64SLSHardeningPass();
+FunctionPass *createAArch64IndirectThunks();
FunctionPass *createAArch64SpeculationHardeningPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
FunctionPass *createAArch64SIMDInstrOptPass();
@@ -52,11 +54,13 @@ FunctionPass *createAArch64BranchTargetsPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
-FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool IsOptNone);
FunctionPass *createAArch64StackTaggingPreRAPass();
void initializeAArch64A53Fix835769Pass(PassRegistry&);
@@ -70,16 +74,19 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&);
void initializeAArch64ConditionOptimizerPass(PassRegistry&);
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry&);
+void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
+void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeSVEIntrinsicOptsPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm
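
Aside: a minimal sketch, not part of this commit, of the legacy-pass-manager
boilerplate that declarations such as createAArch64SLSHardeningPass() and
initializeAArch64SLSHardeningPass(PassRegistry&) pair with in the
implementation file. Every name below is a hypothetical placeholder:

#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

namespace llvm {
FunctionPass *createExampleHardeningPass(); // hypothetical factory
} // end namespace llvm

using namespace llvm;

namespace {
class ExampleHardening : public MachineFunctionPass {
public:
  static char ID; // pass identification, replacement for typeid
  ExampleHardening() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(MachineFunction &MF) override {
    return false; // return true only if MF was actually modified
  }
  StringRef getPassName() const override { return "Example hardening"; }
};
} // end anonymous namespace

char ExampleHardening::ID = 0;

FunctionPass *llvm::createExampleHardeningPass() {
  return new ExampleHardening();
}
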
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 0106355b1a440..534af9686af06 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -42,11 +42,11 @@ def FeatureAES : SubtargetFeature<
"Enable AES support", [FeatureNEON]>;
// Crypto has been split up and any combination is now valid (see the
-// crypto defintions above). Also, crypto is now context sensitive:
+// crypto definitions above). Also, crypto is now context sensitive:
// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
// Therefore, we rely on Clang, the user-facing tool, to pass on the
// appropriate crypto options. But here in the backend, crypto has very little
-// meaning anymore. We kept the Crypto defintion here for backward
+// meaning anymore. We kept the Crypto definition here for backward
// compatibility, and now imply features SHA2 and AES, which was the
// "traditional" meaning of Crypto.
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
@@ -101,7 +101,25 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
"true", "Enable v8.2 data Cache Clean to Point of Persistence" >;
def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
- "Enable Scalable Vector Extension (SVE) instructions">;
+ "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>;
+
+// This flag is currently still labeled as Experimental, but when fully
+// implemented this should tell the compiler to use the zeroing pseudos to
+// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
+// lanes are known to be zero. The pseudos will then be expanded using the
+// MOVPRFX instruction to zero the inactive lanes. This feature should only be
+// enabled if MOVPRFX instructions are known to merge with the destructive
+// operations they prefix.
+//
+// This feature could similarly be extended to support cheap merging of _any_
+// value into the inactive lanes using the MOVPRFX instruction that uses
+// merging-predication.
+def FeatureExperimentalZeroingPseudos
+ : SubtargetFeature<"use-experimental-zeroing-pseudos",
+ "UseExperimentalZeroingPseudos", "true",
+ "Hint to the compiler that the MOVPRFX instruction is "
+ "merged with destructive operations",
+ []>;
def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
"Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
@@ -142,7 +160,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align",
"Disallow all unaligned memory "
"access">;
-foreach i = {1-7,9-15,18,20-28} in
+foreach i = {1-7,9-15,18,20-28,30} in
def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
"Reserve X"#i#", making it unavailable "
"as a GPR">;
@@ -240,11 +258,11 @@ def FeatureDotProd : SubtargetFeature<
def FeaturePA : SubtargetFeature<
"pa", "HasPA", "true",
- "Enable v8.3-A Pointer Authentication enchancement">;
+ "Enable v8.3-A Pointer Authentication extension">;
def FeatureJS : SubtargetFeature<
"jsconv", "HasJS", "true",
- "Enable v8.3-A JavaScript FP conversion enchancement",
+ "Enable v8.3-A JavaScript FP conversion instructions",
[FeatureFPARMv8]>;
def FeatureCCIDX : SubtargetFeature<
@@ -281,6 +299,11 @@ def FeatureAM : SubtargetFeature<
"am", "HasAM", "true",
"Enable v8.4-A Activity Monitors extension">;
+def FeatureAMVS : SubtargetFeature<
+ "amvs", "HasAMVS", "true",
+ "Enable v8.6-A Activity Monitors Virtualization support",
+ [FeatureAM]>;
+
def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
@@ -365,6 +388,25 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"true", "Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits">;
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
+ "true", "Enable BFloat16 Extension" >;
+
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+ "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+ "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+ "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
+def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
+ "true", "Enable fine grained virtualization traps extension">;
+
+def FeatureEnhancedCounterVirtualization :
+ SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
+ "true", "Enable enhanced counter virtualization extension">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -391,8 +433,13 @@ def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict,
FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist,
- FeatureBranchTargetId]
->;
+ FeatureBranchTargetId]>;
+
+def HasV8_6aOps : SubtargetFeature<
+ "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
+
+ [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
+ FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -429,6 +476,17 @@ def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
"true", "Permit use of TPIDR_EL"#i#" for the TLS base">;
//===----------------------------------------------------------------------===//
+// Control codegen mitigation against Straight Line Speculation vulnerability.
+//===----------------------------------------------------------------------===//
+
+def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr",
+ "HardenSlsRetBr", "true",
+ "Harden against straight line speculation across RET and BR instructions">;
+def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
+ "HardenSlsBlr", "true",
+ "Harden against straight line speculation across BLR instructions">;
+
+//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
@@ -443,6 +501,10 @@ def SVEUnsupported : AArch64Unsupported {
HasSVE2BitPerm];
}
+def PAUnsupported : AArch64Unsupported {
+ let F = [HasPA];
+}
+
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
@@ -453,6 +515,7 @@ include "AArch64SchedExynosM4.td"
include "AArch64SchedExynosM5.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
+include "AArch64SchedThunderX3T110.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -563,6 +626,67 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
FeatureSSBS
]>;
+def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
+ "Cortex-A77 ARM processors", [
+ HasV8_2aOps,
+ FeatureFPARMv8,
+ FeatureNEON, FeatureRCPC,
+ FeatureCrypto,
+ FeatureFullFP16,
+ FeatureDotProd
+ ]>;
+
+def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily",
+ "CortexA78",
+ "Cortex-A78 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureNEON,
+ FeatureRCPC,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSPE,
+ FeatureFullFP16,
+ FeatureSSBS,
+ FeatureDotProd]>;
+
+def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
+ "Cortex-X1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureNEON,
+ FeatureRCPC,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSPE,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
+ "Fujitsu A64FX processors", [
+ HasV8_2aOps,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureSHA2,
+ FeaturePerfMon,
+ FeatureFullFP16,
+ FeatureSVE,
+ FeaturePostRAScheduler,
+ FeatureComplxNum
+ ]>;
+
+def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
+ "Nvidia Carmel processors", [
+ HasV8_2aOps,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureFullFP16
+ ]>;
+
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targetting apple OSes.
def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
@@ -780,6 +904,25 @@ def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
FeatureLSE,
HasV8_1aOps]>;
+def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
+ "ThunderX3T110",
+ "Marvell ThunderX3 processors", [
+ FeatureAggressiveFMA,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureLSE,
+ FeaturePA,
+ FeatureUseAA,
+ FeatureBalanceFPOps,
+ FeaturePerfMon,
+ FeatureStrictAlign,
+ HasV8_3aOps]>;
+
def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
FeatureCRC,
@@ -844,7 +987,7 @@ def : ProcessorModel<"generic", NoSchedModel, [
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
-// ETE and TRBE are future architecture extensions. We temporariliy enable them
+// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64, until it is decided in which
// armv8.x-a architecture revision they will end up. The extensions do not
// affect code generated by the compiler and can be used only by explicitly
@@ -853,6 +996,7 @@ def : ProcessorModel<"generic", NoSchedModel, [
]>;
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
+def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
@@ -863,6 +1007,9 @@ def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>;
+def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>;
+def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
@@ -878,6 +1025,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
+// Marvell ThunderX3T110 Processors.
+def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>;
// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
@@ -900,6 +1049,13 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
// Alias for the latest Apple processor model supported by LLVM.
def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>;
+// Fujitsu A64FX
+// FIXME: Scheduling model is not implemented yet.
+def : ProcessorModel<"a64fx", NoSchedModel, [ProcA64FX]>;
+
+// Nvidia Carmel
+def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
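
Aside: a short sketch, not part of this commit, of how SubtargetFeature
records like FeatureBF16 above surface in C++. TableGen turns each record
into a boolean field on AArch64Subtarget plus an accessor, which backend
code then queries; the helper below is a hypothetical illustration:

#include "AArch64Subtarget.h" // assumes we are inside the AArch64 backend

bool canUseBFloat16(const llvm::AArch64Subtarget &ST) {
  // Generated from FeatureBF16's "HasBF16" field; the other records above
  // yield analogous accessors (e.g. hasSVE(), hasMatMulInt8()).
  return ST.hasBF16();
}
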
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 00e321f9b8509..3a94820dac8d3 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -84,8 +84,8 @@ public:
return MCInstLowering.lowerOperand(MO, MCOp);
}
- void EmitStartOfAsmFile(Module &M) override;
- void EmitJumpTableInfo() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitJumpTableInfo() override;
void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned JTI);
@@ -112,7 +112,9 @@ public:
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
+
+ void emitFunctionHeaderComment() override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AsmPrinter::getAnalysisUsage(AU);
@@ -139,7 +141,7 @@ public:
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -162,10 +164,10 @@ private:
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyEnd() override;
MCSymbol *GetCPISymbol(unsigned CPID) const override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
AArch64FunctionInfo *AArch64FI = nullptr;
@@ -182,7 +184,7 @@ private:
} // end anonymous namespace
-void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) {
+void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
if (!TM.getTargetTriple().isOSBinFormatELF())
return;
@@ -225,22 +227,29 @@ void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) {
OutStreamer->SwitchSection(Nt);
// Emit the note header.
- EmitAlignment(Align(8));
- OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0"
- OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size
- OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
- OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+ emitAlignment(Align(8));
+ OutStreamer->emitInt32(4); // data size for "GNU\0"
+ OutStreamer->emitInt32(4 * 4); // Elf_Prop size
+ OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0);
+ OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
// Emit the PAC/BTI properties.
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
- OutStreamer->EmitIntValue(4, 4); // data size
- OutStreamer->EmitIntValue(Flags, 4); // data
- OutStreamer->EmitIntValue(0, 4); // pad
+ OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND);
+ OutStreamer->emitInt32(4); // data size
+ OutStreamer->emitInt32(Flags); // data
+ OutStreamer->emitInt32(0); // pad
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
}
+void AArch64AsmPrinter::emitFunctionHeaderComment() {
+ const AArch64FunctionInfo *FI = MF->getInfo<AArch64FunctionInfo>();
+ Optional<std::string> OutlinerString = FI->getOutliningStyle();
+ if (OutlinerString != None)
+ OutStreamer->GetCommentOS() << ' ' << OutlinerString;
+}
+
void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
{
const Function &F = MF->getFunction();
@@ -250,8 +259,7 @@ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
.getValueAsString()
.getAsInteger(10, Num))
return;
- for (; Num; --Num)
- EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ emitNops(Num);
return;
}
@@ -291,9 +299,9 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// ;DATA: higher 32 bits of the address of the trampoline
// LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
//
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Emit "B #32" instruction, which jumps over the next 28 bytes.
@@ -304,8 +312,8 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
for (int8_t I = 0; I < NoopsInSledCount; I++)
EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, Kind);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, Kind, 2);
}
void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
@@ -364,25 +372,25 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
Sym->getName()));
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak);
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden);
- OutStreamer->EmitLabel(Sym);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden);
+ OutStreamer->emitLabel(Sym);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(4)
.addImm(55),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBroX)
.addReg(AArch64::W16)
.addReg(AArch64::X9)
.addReg(AArch64::X16)
.addImm(0)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::SUBSXrs)
.addReg(AArch64::XZR)
.addReg(AArch64::X16)
@@ -390,33 +398,33 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::NE)
.addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
OutContext)),
*STI);
MCSymbol *ReturnSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(ReturnSym);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitLabel(ReturnSym);
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
- OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
+ OutStreamer->emitLabel(HandleMismatchOrPartialSym);
if (IsShort) {
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri)
.addReg(AArch64::WZR)
.addReg(AArch64::W16)
.addImm(15)
.addImm(0),
*STI);
MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::HI)
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ANDXri)
.addReg(AArch64::X17)
.addReg(Reg)
@@ -424,59 +432,59 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
*STI);
unsigned Size = 1 << (AccessInfo & 0xf);
if (Size != 1)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X17)
.addReg(AArch64::X17)
.addImm(Size - 1)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWrs)
.addReg(AArch64::WZR)
.addReg(AArch64::W16)
.addReg(AArch64::W17)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::LS)
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ORRXri)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBui)
.addReg(AArch64::W16)
.addReg(AArch64::X16)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::SUBSXrs)
.addReg(AArch64::XZR)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::EQ)
.addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
*STI);
- OutStreamer->EmitLabel(HandleMismatchSym);
+ OutStreamer->emitLabel(HandleMismatchSym);
}
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXpre)
.addReg(AArch64::SP)
.addReg(AArch64::X0)
.addReg(AArch64::X1)
.addReg(AArch64::SP)
.addImm(-32),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXi)
.addReg(AArch64::FP)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
@@ -484,13 +492,13 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
*STI);
if (Reg != AArch64::X0)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::ORRXrs)
.addReg(AArch64::X0)
.addReg(AArch64::XZR)
.addReg(Reg)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::MOVZXi)
.addReg(AArch64::X1)
.addImm(AccessInfo)
.addImm(0),
@@ -499,14 +507,14 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
// Intentionally load the GOT entry and branch to it, rather than possibly
// late binding the function, which may clobber the registers before we have
// a chance to save them.
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ADRP)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::LDRXui)
.addReg(AArch64::X16)
.addReg(AArch64::X16)
@@ -514,12 +522,12 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
}
}
-void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
EmitHwasanMemaccessSymbols(M);
const Triple &TT = TM.getTargetTriple();
@@ -529,7 +537,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
// implementation of multiple entry points). If this doesn't occur, the
// linker can safely perform dead code stripping. Since LLVM never
// generates code that does this, it is always safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
emitStackMaps(SM);
}
@@ -544,12 +552,12 @@ void AArch64AsmPrinter::EmitLOHs() {
"Label hasn't been inserted for LOH related instruction");
MCArgs.push_back(LabelIt->second);
}
- OutStreamer->EmitLOHDirective(D.getKind(), MCArgs);
+ OutStreamer->emitLOHDirective(D.getKind(), MCArgs);
MCArgs.clear();
}
}
-void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+void AArch64AsmPrinter::emitFunctionBodyEnd() {
if (!AArch64FI->getLOHRelated().empty())
EmitLOHs();
}
@@ -741,11 +749,10 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
assert(NOps == 4);
OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
// cast away const; DIetc do not take const operands for some reason.
- OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata())
- ->getName();
+ OS << MI->getDebugVariable()->getName();
OS << " <- ";
// Frame address. Currently handles register +- offset only.
- assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ assert(MI->getDebugOperand(0).isReg() && MI->isDebugOffsetImm());
OS << '[';
printOperand(MI, 0, OS);
OS << '+';
@@ -755,7 +762,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
printOperand(MI, NOps - 2, OS);
}
-void AArch64AsmPrinter::EmitJumpTableInfo() {
+void AArch64AsmPrinter::emitJumpTableInfo() {
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
if (!MJTI) return;
@@ -783,8 +790,8 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
if (JTBBs.empty()) continue;
unsigned Size = AFI->getJumpTableEntrySize(JTI);
- EmitAlignment(Align(Size));
- OutStreamer->EmitLabel(GetJTISymbol(JTI));
+ emitAlignment(Align(Size));
+ OutStreamer->emitLabel(GetJTISymbol(JTI));
for (auto *JTBB : JTBBs)
emitJumpTableEntry(MJTI, JTBB, JTI);
@@ -812,7 +819,7 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
Value, MCConstantExpr::create(2, OutContext), OutContext);
}
- OutStreamer->EmitValue(Value, Size);
+ OutStreamer->emitValue(Value, Size);
}
/// Small jump tables contain an unsigned byte or half, representing the offset
@@ -868,7 +875,7 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
auto &Ctx = OutStreamer.getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer.EmitLabel(MILabel);
+ OutStreamer.emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
@@ -898,7 +905,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
auto &Ctx = OutStreamer.getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer.EmitLabel(MILabel);
+ OutStreamer.emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers Opers(&MI);
@@ -982,7 +989,7 @@ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
-void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -992,7 +999,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *LOHLabel = createTempSymbol("loh");
// Associate the instruction with the label
LOHInstToLabel[MI] = LOHLabel;
- OutStreamer->EmitLabel(LOHLabel);
+ OutStreamer->emitLabel(LOHLabel);
}
AArch64TargetStreamer *TS =
@@ -1001,6 +1008,26 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
+ case AArch64::HINT: {
+ // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+ // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+ // non-empty. If MI is the initial BTI, place the
+ // __patchable_function_entries label after BTI.
+ if (CurrentPatchableFunctionEntrySym &&
+ CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+ MI == &MF->front().front()) {
+ int64_t Imm = MI->getOperand(0).getImm();
+ if ((Imm & 32) && (Imm & 6)) {
+ MCInst Inst;
+ MCInstLowering.Lower(MI, Inst);
+ EmitToStreamer(*OutStreamer, Inst);
+ CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ return;
+ }
+ }
+ break;
+ }
case AArch64::MOVMCSym: {
Register DestReg = MI->getOperand(0).getReg();
const MachineOperand &MO_Sym = MI->getOperand(1);
@@ -1048,7 +1075,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> TmpStr;
raw_svector_ostream OS(TmpStr);
PrintDebugValueComment(MI, OS);
- OutStreamer->EmitRawText(StringRef(OS.str()));
+ OutStreamer->emitRawText(StringRef(OS.str()));
}
return;
@@ -1061,7 +1088,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (needsCFIMoves() == CFI_M_None)
return;
- OutStreamer->EmitCFIBKeyFrame();
+ OutStreamer->emitCFIBKeyFrame();
return;
}
}
@@ -1087,6 +1114,25 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
+ case AArch64::SpeculationBarrierISBDSBEndBB: {
+ // Print DSB SYS + ISB
+ MCInst TmpInstDSB;
+ TmpInstDSB.setOpcode(AArch64::DSB);
+ TmpInstDSB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstDSB);
+ MCInst TmpInstISB;
+ TmpInstISB.setOpcode(AArch64::ISB);
+ TmpInstISB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstISB);
+ return;
+ }
+ case AArch64::SpeculationBarrierSBEndBB: {
+ // Print SB
+ MCInst TmpInstSB;
+ TmpInstSB.setOpcode(AArch64::SB);
+ EmitToStreamer(*OutStreamer, TmpInstSB);
+ return;
+ }
case AArch64::TLSDESC_CALLSEQ: {
/// lower this to:
/// adrp x0, :tlsdesc:var
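
Aside: a small sketch, not part of this commit, unpacking the HINT-immediate
test in the new AArch64::HINT case above. BTI is encoded as HINT #32 plus
flag bits: #32 is plain BTI, #34 is BTI c, #36 is BTI j and #38 is BTI jc,
so the check matches exactly the c/j/jc forms that can guard a branch
target:

// Hypothetical helper mirroring the condition in emitInstruction().
static bool isGuardingBTIHint(int64_t Imm) {
  return (Imm & 32) && (Imm & 6); // BTI c (#34), BTI j (#36), BTI jc (#38)
}
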
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 6fa3a462bc71a..1956014b738d0 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -118,9 +118,15 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
auto MBBI = MBB.begin();
- // PACI[AB]SP are implicitly BTI JC, so no BTI instruction needed there.
- if (MBBI != MBB.end() && (MBBI->getOpcode() == AArch64::PACIASP ||
- MBBI->getOpcode() == AArch64::PACIBSP))
+ // Skip the meta instructions, those will be removed anyway.
+ for (; MBBI != MBB.end() && MBBI->isMetaInstruction(); ++MBBI)
+ ;
+
+ // SCTLR_EL1.BT[01] is set to 0 by default which means
+ // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there.
+ if (MBBI != MBB.end() && HintNum == 34 &&
+ (MBBI->getOpcode() == AArch64::PACIASP ||
+ MBBI->getOpcode() == AArch64::PACIBSP))
return;
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index a0695cef615f3..84ec5afcc9c19 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,18 +38,17 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
- CCState &State, unsigned SlotAlign) {
+ CCState &State, Align SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
const Align StackAlign =
State.getMachineFunction().getDataLayout().getStackAlignment();
- const Align OrigAlign(ArgFlags.getOrigAlign());
- const Align Align = std::min(OrigAlign, StackAlign);
+ const Align OrigAlign = ArgFlags.getNonZeroOrigAlign();
+ const Align Alignment = std::min(OrigAlign, StackAlign);
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(
- Size, std::max((unsigned)Align.value(), SlotAlign)));
+ It.convertToMem(State.AllocateStack(Size, std::max(Alignment, SlotAlign)));
State.addLoc(It);
- SlotAlign = 1;
+ SlotAlign = Align(1);
}
// All pending members have now been allocated
@@ -72,7 +71,7 @@ static bool CC_AArch64_Custom_Stack_Block(
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
- return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, Align(8));
}
/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
@@ -146,7 +145,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
for (auto Reg : RegList)
State.AllocateReg(Reg);
- unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+ const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
}
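
Aside: a minimal sketch, not part of this commit, of the llvm::Align type
that finishStackBlock() migrates to above. Align wraps a power-of-two byte
alignment, asserts on invalid values, and is ordered, which is what lets the
patched code write std::max(Alignment, SlotAlign) directly:

#include "llvm/Support/Alignment.h"
#include <algorithm>

void alignExample() {
  llvm::Align SlotAlign(8);  // must be a power of two; asserts otherwise
  llvm::Align OrigAlign(16);
  llvm::Align Chosen = std::max(OrigAlign, SlotAlign);
  (void)Chosen;              // Chosen.value() == 16
}
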
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index a0b2d7712b662..fdcc890bf5892 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -10,9 +10,6 @@
//
//===----------------------------------------------------------------------===//
-/// CCIfAlign - Match of the original alignment of the arg
-class CCIfAlign<string Align, CCAction A> :
- CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
@@ -33,9 +30,9 @@ def CC_AArch64_AAPCS : CallingConv<[
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
- CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
CCBitConvertToType<f64>>>,
- CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
CCBitConvertToType<f128>>>,
// In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter.
@@ -75,10 +72,10 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCPassIndirect<i64>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -102,22 +99,24 @@ def CC_AArch64_AAPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
- CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+ CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>,
CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -132,9 +131,9 @@ def RetCC_AArch64_AAPCS : CallingConv<[
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
- CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
CCBitConvertToType<f64>>>,
- CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
CCBitConvertToType<f128>>>,
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -144,18 +143,20 @@ def RetCC_AArch64_AAPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -165,7 +166,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
// Vararg functions on windows pass floats in integer registers
let Entry = 1 in
def CC_AArch64_Win64_VarArg : CallingConv<[
- CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
CCIfType<[f64], CCBitConvertToType<i64>>,
CCDelegateTo<CC_AArch64_AAPCS>
]>;
@@ -219,19 +220,22 @@ def CC_AArch64_DarwinPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
- CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+ CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16",
+ CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
// Re-demote pointers to 32-bits so we don't end up storing 64-bit
@@ -239,9 +243,9 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -255,14 +259,14 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// Handle all scalar types as either i64 or f64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
- CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -275,16 +279,16 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
// Handle all scalar types as either i32 or f32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- CCIfType<[f16], CCPromoteToType<f32>>,
+ CCIfType<[f16, bf16], CCPromoteToType<f32>>,
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -377,11 +381,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
-// Darwin puts the frame-record at the top of the callee-save area.
-def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
- D8, D9, D10, D11,
- D12, D13, D14, D15)>;
+// A variant for treating X18 as callee saved, when interfacing with
+// code that needs X18 to be preserved.
+def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>;
// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
@@ -421,33 +423,7 @@ def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23),
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
def CSR_AArch64_AAPCS_SwiftError
- : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
-
-// The function used by Darwin to obtain the address of a thread-local variable
-// guarantees more than a normal AAPCS function. x16 and x17 are used on the
-// fast path for calculation, but other registers except X0 (argument/return)
-// and LR (it is a call, after all) are preserved.
-def CSR_AArch64_TLS_Darwin
- : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
- FP,
- (sequence "Q%u", 0, 31))>;
-
-// We can only handle a register pair with adjacent registers, the register pair
-// should belong to the same class as well. Since the access function on the
-// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
-// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
-def CSR_AArch64_CXX_TLS_Darwin
- : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
- (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
- (sequence "D%u", 0, 31))>;
-
-// CSRs that are handled by prologue, epilogue.
-def CSR_AArch64_CXX_TLS_Darwin_PE
- : CalleeSavedRegs<(add LR, FP)>;
-
-// CSRs that are handled explicitly via copies.
-def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
- : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
// The ELF stub used for TLS-descriptor access saves every feasible
// register. Only X0 and LR are clobbered.
@@ -472,14 +448,57 @@ def CSR_AArch64_StackProbe_Windows
(sequence "X%u", 18, 28), FP, SP,
(sequence "Q%u", 0, 31))>;
+// Darwin variants of AAPCS.
+// Darwin puts the frame-record at the top of the callee-save area.
+def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21,
+ X22, X23, X24, X25, X26, X27,
+ X28, (sequence "Q%u", 8, 23))>;
+def CSR_Darwin_AArch64_AAPCS_ThisReturn
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>;
+
+def CSR_Darwin_AArch64_AAPCS_SwiftError
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_Darwin_AArch64_TLS
+ : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+ FP,
+ (sequence "Q%u", 0, 31))>;
+
+// We can only handle a register pair with adjacent registers, the register pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_Darwin_AArch64_TLS,
+// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS.
+def CSR_Darwin_AArch64_CXX_TLS
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
+ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_Darwin_AArch64_CXX_TLS_PE
+ : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_Darwin_AArch64_CXX_TLS_ViaCopy
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>;
+
+def CSR_Darwin_AArch64_RT_MostRegs
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>;
+
// Variants of the standard calling conventions for shadow call stack.
// These all preserve x18 in addition to any other registers.
def CSR_AArch64_NoRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>;
def CSR_AArch64_AllRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>;
-def CSR_AArch64_CXX_TLS_Darwin_SCS
- : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>;
def CSR_AArch64_AAPCS_SwiftError_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>;
def CSR_AArch64_RT_MostRegs_SCS
diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 688bd1b28e855..3f244ba10102a 100644
--- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -105,6 +105,10 @@ struct LDTLSCleanup : public MachineFunctionPass {
TII->get(TargetOpcode::COPY), AArch64::X0)
.addReg(TLSBaseAddrReg);
+ // Update the call site info.
+ if (I.shouldUpdateCallSiteInfo())
+ I.getMF()->eraseCallSiteInfo(&I);
+
// Erase the TLS_base_addr instruction.
I.eraseFromParent();
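
Aside, not part of this commit: the two lines added above are the standard
idiom whenever a pass erases a call-like MachineInstr; without the erase,
the MachineFunction's call-site-info map would be left with a dangling
entry. A distilled sketch with a hypothetical helper name (when the call is
replaced rather than deleted, moveCallSiteInfo() is the counterpart):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

void eraseCallInstr(llvm::MachineInstr &I) {
  if (I.shouldUpdateCallSiteInfo())
    I.getMF()->eraseCallSiteInfo(&I); // drop bookkeeping before deletion
  I.eraseFromParent();
}
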
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 35e6fef24363c..efdb1131abc91 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -382,7 +382,7 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
/// Update state when seeing and ADRP instruction.
static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
- LOHInfo &Info) {
+ LOHInfo &Info, LOHInfo *LOHInfos) {
if (Info.LastADRP != nullptr) {
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n"
<< '\t' << MI << '\t' << *Info.LastADRP);
@@ -393,12 +393,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
// Produce LOH directive if possible.
if (Info.IsCandidate) {
switch (Info.Type) {
- case MCLOH_AdrpAdd:
+ case MCLOH_AdrpAdd: {
+ // ADRPs and ADDs for this candidate may be split apart if using
+ // GlobalISel instead of pseudo-expanded. If that happens, the
+ // def register of the ADD may have a use in between. Adding an LOH in
+ // this case can cause the linker to rewrite the ADRP to write to that
+ // register, clobbering the use.
+ const MachineInstr *AddMI = Info.MI0;
+ int DefIdx = mapRegToGPRIndex(MI.getOperand(0).getReg());
+ int OpIdx = mapRegToGPRIndex(AddMI->getOperand(0).getReg());
+ LOHInfo DefInfo = LOHInfos[OpIdx];
+ if (DefIdx != OpIdx && (DefInfo.OneUser || DefInfo.MultiUsers))
+ break;
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n"
<< '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0});
++NumADRSimpleCandidate;
break;
+ }
case MCLOH_AdrpLdr:
if (supportLoadFromLiteral(*Info.MI0)) {
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n"
@@ -522,7 +534,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
// Walk the basic block backwards and update the per register state machine
// in the process.
- for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+ for (const MachineInstr &MI :
+ instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AArch64::ADDXri:
@@ -544,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand &Op0 = MI.getOperand(0);
int Idx = mapRegToGPRIndex(Op0.getReg());
if (Idx >= 0) {
- handleADRP(MI, AFI, LOHInfos[Idx]);
+ handleADRP(MI, AFI, LOHInfos[Idx], LOHInfos);
continue;
}
break;
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index bb99f2516ecf0..aa41cae289e8b 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -11,8 +11,74 @@
include "llvm/Target/GlobalISel/Combine.td"
+def fconstant_to_constant : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_FCONSTANT):$root,
+ [{ return matchFConstantToConstant(*${root}, MRI); }]),
+ (apply [{ applyFConstantToConstant(*${root}); }])>;
+
def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
"AArch64GenPreLegalizerCombinerHelper", [all_combines,
- elide_br_by_inverting_cond]> {
+ elide_br_by_inverting_cond,
+ fconstant_to_constant]> {
let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
+ let StateClass = "AArch64PreLegalizerCombinerHelperState";
+ let AdditionalArguments = [];
+}
+
+// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a
+// target-specific opcode.
+def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">;
+
+def rev : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchREV(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def zip : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchZip(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def uzp : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def dup : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def trn : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchTRN(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def ext : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchEXT(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyEXT(*${root}, ${matchinfo}); }])
+>;
+
+// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
+// instruction.
+def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>;
+
+def AArch64PostLegalizerCombinerHelper
+ : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
+ [erase_undef_store, combines_for_extload,
+ sext_already_extended, shuffle_vector_pseudos]> {
+ let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
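
Each [{ ... }] fragment above names a C++ matcher or applier that is compiled
into the generated combiner. A sketch of the shapes these rules assume — the
struct layout and the toy mask check are assumptions for illustration; the
real definitions live in AArch64PostLegalizerCombiner.cpp:

    // Matchdata filled in by the match step and consumed by the apply step.
    struct ShuffleVectorPseudo {
      unsigned Opc;                 // Target pseudo, e.g. AArch64::G_REV64.
      Register Dst;                 // Destination of the shuffle.
      SmallVector<SrcOp, 2> SrcOps; // Sources of the replacement instruction.
      ShuffleVectorPseudo(unsigned Opc, Register Dst,
                          std::initializer_list<SrcOp> SrcOps)
          : Opc(Opc), Dst(Dst), SrcOps(SrcOps) {}
      ShuffleVectorPseudo() = default;
    };

    static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
                         ShuffleVectorPseudo &MatchInfo) {
      assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
      ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
      // Toy check: swapping the lanes of a two-element vector is a REV.
      if (Mask.size() == 2 && Mask[0] == 1 && Mask[1] == 0) {
        MatchInfo = ShuffleVectorPseudo(
            AArch64::G_REV64, MI.getOperand(0).getReg(),
            {MI.getOperand(1).getReg()});
        return true;
      }
      return false;
    }
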
diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
index 2592387059652..57dc8a4061f12 100644
--- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -79,7 +79,7 @@ void AArch64CompressJumpTables::scanFunction() {
for (MachineBasicBlock &MBB : *MF) {
const Align Alignment = MBB.getAlignment();
unsigned AlignedOffset;
- if (Alignment == Align::None())
+ if (Alignment == Align(1))
AlignedOffset = Offset;
else
AlignedOffset = alignTo(Offset, Alignment);
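
Upstream removed Align::None() in favor of spelling the "no alignment" case
as Align(1), which is the identity under alignTo. A quick sketch of the
assumed semantics:

    #include "llvm/Support/Alignment.h"
    #include <cassert>
    using namespace llvm;

    void alignToExample() {
      assert(alignTo(37, Align(1)) == 37); // identity, the old Align::None()
      assert(alignTo(37, Align(4)) == 40); // rounds up to a multiple of 4
    }
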
diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 25e23e4623de1..e90e8e3da0576 100644
--- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -194,12 +194,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
// There must not be any instruction between DefMI and MI that clobbers or
// reads NZCV.
- MachineBasicBlock::iterator I(DefMI), E(MI);
- for (I = std::next(I); I != E; ++I) {
- if (I->modifiesRegister(AArch64::NZCV, TRI) ||
- I->readsRegister(AArch64::NZCV, TRI))
- return false;
- }
+ if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI))
+ return false;
LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
LLVM_DEBUG(DefMI.print(dbgs()));
LLVM_DEBUG(dbgs() << " ");
@@ -253,12 +249,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
return false;
// There must not be any instruction between DefMI and MI that clobbers or
// reads NZCV.
- MachineBasicBlock::iterator I(DefMI), E(MI);
- for (I = std::next(I); I != E; ++I) {
- if (I->modifiesRegister(AArch64::NZCV, TRI) ||
- I->readsRegister(AArch64::NZCV, TRI))
- return false;
- }
+ if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI))
+ return false;
LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
LLVM_DEBUG(DefMI.print(dbgs()));
LLVM_DEBUG(dbgs() << " ");
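
Both hunks fold the same backward scan into one query. A sketch of what the
new helper presumably does — equivalent to the removed loops, but expressed
over a debug-instruction-skipping range so -g cannot change the outcome
(assumed to match the definition added to AArch64InstrInfo):

    static bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
                                                const MachineInstr &MI,
                                                const TargetRegisterInfo *TRI) {
      // Scan the open interval (DefMI, MI) for any read or write of NZCV.
      return llvm::any_of(
          instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                   MI.getIterator()),
          [TRI](const MachineInstr &I) {
            return I.modifiesRegister(AArch64::NZCV, TRI) ||
                   I.readsRegister(AArch64::NZCV, TRI);
          });
    }
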
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 51b2ce0297019..64f0bb63762de 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -145,11 +145,11 @@ void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
// instructions.
MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
MachineBasicBlock *MBB) {
- MachineBasicBlock::iterator I = MBB->getFirstTerminator();
- if (I == MBB->end())
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ if (Term == MBB->end())
return nullptr;
- if (I->getOpcode() != AArch64::Bcc)
+ if (Term->getOpcode() != AArch64::Bcc)
return nullptr;
// Since we may modify cmp of this MBB, make sure NZCV does not live out.
@@ -158,32 +158,33 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
return nullptr;
// Now find the instruction controlling the terminator.
- for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
- --I;
- assert(!I->isTerminator() && "Spurious terminator");
+ for (MachineBasicBlock::iterator B = MBB->begin(), It = Term; It != B;) {
+ It = prev_nodbg(It, B);
+ MachineInstr &I = *It;
+ assert(!I.isTerminator() && "Spurious terminator");
// Check if there is any use of NZCV between CMP and Bcc.
- if (I->readsRegister(AArch64::NZCV))
+ if (I.readsRegister(AArch64::NZCV))
return nullptr;
- switch (I->getOpcode()) {
+ switch (I.getOpcode()) {
// cmp is an alias for subs with a dead destination register.
case AArch64::SUBSWri:
case AArch64::SUBSXri:
// cmn is an alias for adds with a dead destination register.
case AArch64::ADDSWri:
case AArch64::ADDSXri: {
- unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
- if (!I->getOperand(2).isImm()) {
- LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(I.getOperand(3).getImm());
+ if (!I.getOperand(2).isImm()) {
+ LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << I << '\n');
return nullptr;
- } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
- LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I
+ } else if (I.getOperand(2).getImm() << ShiftAmt >= 0xfff) {
+ LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << I
<< '\n');
return nullptr;
- } else if (!MRI->use_empty(I->getOperand(0).getReg())) {
- LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ } else if (!MRI->use_nodbg_empty(I.getOperand(0).getReg())) {
+ LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << I << '\n');
return nullptr;
}
- return &*I;
+ return &I;
}
// Prevent false positive case like:
// cmp w19, #0
@@ -294,12 +295,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
.add(BrMI.getOperand(1));
BrMI.eraseFromParent();
- MBB->updateTerminator();
-
++NumConditionsAdjusted;
}
-// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
// corresponding to TBB.
// Returns true if parsing was successful, otherwise false is returned.
static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
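
The prev_nodbg change above makes the backward walk skip DBG_VALUE and
friends, so the compare that is found no longer depends on the presence of
debug info. A sketch of the assumed iterator helper (the real one lives in
MachineBasicBlock.h):

    // Step back one instruction, then keep stepping back over debug
    // instructions without walking past Begin.
    template <typename IterT> IterT prevNoDbg(IterT It, IterT Begin) {
      --It;
      while (It != Begin && It->isDebugInstr())
        --It;
      return It;
    }
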
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 054ef8f482ca9..82e8df3b73f90 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -157,7 +157,7 @@ public:
MachineInstr *CmpMI;
private:
- /// The branch condition in Head as determined by AnalyzeBranch.
+ /// The branch condition in Head as determined by analyzeBranch.
SmallVector<MachineOperand, 4> HeadCond;
/// The condition code that makes Head branch to CmpBB.
@@ -267,7 +267,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) {
return MRI->use_nodbg_empty(DstReg);
}
-// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
// corresponding to TBB.
// Returns true if parsing was successful, otherwise false is returned.
static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
@@ -317,7 +317,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// Now find the instruction controlling the terminator.
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
- --I;
+ I = prev_nodbg(I, MBB->begin());
assert(!I->isTerminator() && "Spurious terminator");
switch (I->getOpcode()) {
// cmp is an alias for subs with a dead destination register.
@@ -509,7 +509,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// landing pad.
if (!TBB || HeadCond.empty()) {
LLVM_DEBUG(
- dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ dbgs() << "analyzeBranch didn't find conditional branch in Head.\n");
++NumHeadBranchRejs;
return false;
}
@@ -536,7 +536,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
if (!TBB || CmpBBCond.empty()) {
LLVM_DEBUG(
- dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n");
++NumCmpBranchRejs;
return false;
}
@@ -710,7 +710,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
.add(CmpMI->getOperand(1)); // Branch target.
}
CmpMI->eraseFromParent();
- Head->updateTerminator();
+ Head->updateTerminator(CmpBB->getNextNode());
RemovedBlocks.push_back(CmpBB);
CmpBB->eraseFromParent();
@@ -828,7 +828,7 @@ void AArch64ConditionalCompares::updateDomTree(
assert(Node != HeadNode && "Cannot erase the head node");
assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
while (Node->getNumChildren())
- DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->changeImmediateDominator(Node->back(), HeadNode);
DomTree->eraseNode(RemovedMBB);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3b8f8a19fe49c..9e65ad2e18f95 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -68,6 +68,8 @@ private:
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+ bool expand_DestructiveOp(MachineInstr &MI, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
unsigned ExtendImm, unsigned ZeroReg,
@@ -78,6 +80,9 @@ private:
bool expandSetTagLoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandSVESpillFill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned Opc,
+ unsigned N);
};
} // end anonymous namespace
@@ -344,27 +349,225 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
return true;
}
+/// \brief Expand Pseudos to Instructions with destructive operands.
+///
+/// This mechanism uses MOVPRFX instructions to zero the false lanes or to
+/// fix relaxed register allocation conditions so that they comply with the
+/// instruction's register constraints. The latter case may be cheaper than
+/// tightening the constraints in the register allocator, since that would
+/// insert regular MOV instructions rather than MOVPRFX.
+///
+/// Example (after register allocation):
+///
+/// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0
+///
+/// * The Pseudo FSUB_ZPZZ_ZERO_B maps to FSUB_ZPmZ_B.
+/// * We cannot map directly to FSUB_ZPmZ_B because the register
+/// constraints of the instruction are not met.
+/// * Also, the _ZERO suffix specifies that the false lanes need to be
+///   zeroed.
+///
+/// We first check whether the destructive operand equals the result
+/// operand; if not, we try to swap the operands, e.g.
+///
+/// FSUB_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// But because FSUB_ZPmZ is not commutative, this is semantically
+/// different, so we need a reverse instruction:
+///
+/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// Then we implement the zeroing of the false lanes of Z0 by adding
+/// a zeroing MOVPRFX instruction:
+///
+/// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0
+/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// Note that this can only be done for the _ZERO or _UNDEF variants, where
+/// we can guarantee that the false lanes are zeroed (by this expansion) or
+/// are undef (don't care / not used); otherwise swapping the operands is
+/// illegal, because the operation is not (and cannot be emulated to be)
+/// fully commutative.
+bool AArch64ExpandPseudo::expand_DestructiveOp(
+ MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode());
+ uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
+ uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
+ bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+
+ if (DType == AArch64::DestructiveBinary)
+ assert(DstReg != MI.getOperand(3).getReg());
+
+ bool UseRev = false;
+ unsigned PredIdx, DOPIdx, SrcIdx;
+ switch (DType) {
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1
+ std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2);
+ UseRev = true;
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case AArch64::DestructiveBinary:
+ case AArch64::DestructiveBinaryImm:
+ std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
+ break;
+ default:
+ llvm_unreachable("Unsupported Destructive Operand type");
+ }
+
+#ifndef NDEBUG
+  // MOVPRFX can only be used when the destination operand is also the
+  // destructive operand (not any other operand), so the destructive
+  // operand must be unique.
+ bool DOPRegIsUnique = false;
+ switch (DType) {
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ DOPRegIsUnique =
+ DstReg != MI.getOperand(DOPIdx).getReg() ||
+ MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg();
+ break;
+ case AArch64::DestructiveBinaryImm:
+ DOPRegIsUnique = true;
+ break;
+ }
+#endif
+
+ // Resolve the reverse opcode
+ if (UseRev) {
+ int NewOpcode;
+ // e.g. DIV -> DIVR
+ if ((NewOpcode = AArch64::getSVERevInstr(Opcode)) != -1)
+ Opcode = NewOpcode;
+ // e.g. DIVR -> DIV
+ else if ((NewOpcode = AArch64::getSVENonRevInstr(Opcode)) != -1)
+ Opcode = NewOpcode;
+ }
+
+ // Get the right MOVPRFX
+ uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
+ unsigned MovPrfx, MovPrfxZero;
+ switch (ElementSize) {
+ case AArch64::ElementSizeNone:
+ case AArch64::ElementSizeB:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
+ break;
+ case AArch64::ElementSizeH:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
+ break;
+ case AArch64::ElementSizeS:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
+ break;
+ case AArch64::ElementSizeD:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
+ break;
+ default:
+ llvm_unreachable("Unsupported ElementSize");
+ }
+
+ //
+  // Create the MOVPRFX instruction (if required)
+ //
+ MachineInstrBuilder PRFX, DOP;
+ if (FalseZero) {
+#ifndef NDEBUG
+ assert(DOPRegIsUnique && "The destructive operand should be unique");
+#endif
+ assert(ElementSize != AArch64::ElementSizeNone &&
+ "This instruction is unpredicated");
+
+ // Merge source operand into destination register
+ PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
+ .addReg(DstReg, RegState::Define)
+ .addReg(MI.getOperand(PredIdx).getReg())
+ .addReg(MI.getOperand(DOPIdx).getReg());
+
+  // After the movprfx, the destructive operand is the same as Dst
+ DOPIdx = 0;
+ } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
+#ifndef NDEBUG
+ assert(DOPRegIsUnique && "The destructive operand should be unique");
+#endif
+ PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
+ .addReg(DstReg, RegState::Define)
+ .addReg(MI.getOperand(DOPIdx).getReg());
+ DOPIdx = 0;
+ }
+
+ //
+ // Create the destructive operation
+ //
+ DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+
+ switch (DType) {
+ case AArch64::DestructiveBinaryImm:
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ DOP.add(MI.getOperand(PredIdx))
+ .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .add(MI.getOperand(SrcIdx));
+ break;
+ }
+
+ if (PRFX) {
+ finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
+ transferImpOps(MI, PRFX, DOP);
+ } else
+ transferImpOps(MI, DOP, DOP);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64ExpandPseudo::expandSetTagLoop(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- Register SizeReg = MI.getOperand(2).getReg();
- Register AddressReg = MI.getOperand(3).getReg();
+ Register SizeReg = MI.getOperand(0).getReg();
+ Register AddressReg = MI.getOperand(1).getReg();
MachineFunction *MF = MBB.getParent();
- bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
- const unsigned OpCode =
+ bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback;
+ const unsigned OpCode1 =
+ ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
+ const unsigned OpCode2 =
ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+ unsigned Size = MI.getOperand(2).getImm();
+ assert(Size > 0 && Size % 16 == 0);
+ if (Size % (16 * 2) != 0) {
+ BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
+ .addReg(AddressReg)
+ .addReg(AddressReg)
+ .addImm(1);
+ Size -= 16;
+ }
+ MachineBasicBlock::iterator I =
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
+ .addImm(Size);
+ expandMOVImm(MBB, I, 64);
+
auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoopBB);
MF->insert(++LoopBB->getIterator(), DoneBB);
- BuildMI(LoopBB, DL, TII->get(OpCode))
+ BuildMI(LoopBB, DL, TII->get(OpCode2))
.addDef(AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)
@@ -402,6 +605,28 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
return true;
}
+bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Opc, unsigned N) {
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ MachineInstr &MI = *MBBI;
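+  // Expand one spill/fill of an N-register Z-tuple into N single-register
+  // STR_ZXI / LDR_ZXI operations on its zsub parts, e.g. (illustrative):
+  //   STR_ZZXI %z0_z1, %x0, 4
+  // becomes
+  //   STR_ZXI %z0, %x0, 4
+  //   STR_ZXI %z1, %x0, 5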
+ for (unsigned Offset = 0; Offset < N; ++Offset) {
+ int ImmOffset = MI.getOperand(2).getImm() + Offset;
+ bool Kill = (Offset + 1 == N) ? MI.getOperand(1).isKill() : false;
+ assert(ImmOffset >= -256 && ImmOffset < 256 &&
+ "Immediate spill offset out of range");
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(
+ TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset),
+ Opc == AArch64::LDR_ZXI ? RegState::Define : 0)
+ .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill))
+ .addImm(ImmOffset);
+ }
+ MI.eraseFromParent();
+ return true;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -409,10 +634,76 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
+
+ // Check if we can expand the destructive op
+ int OrigInstr = AArch64::getSVEPseudoMap(MI.getOpcode());
+ if (OrigInstr != -1) {
+ auto &Orig = TII->get(OrigInstr);
+ if ((Orig.TSFlags & AArch64::DestructiveInstTypeMask)
+ != AArch64::NotDestructive) {
+ return expand_DestructiveOp(MI, MBB, MBBI);
+ }
+ }
+
switch (Opcode) {
default:
break;
+ case AArch64::BSPv8i8:
+ case AArch64::BSPv16i8: {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // Expand to BIT
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ } else if (DstReg == MI.getOperand(2).getReg()) {
+ // Expand to BIF
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ } else {
+ // Expand to BSL, use additional move if required
+ if (DstReg == MI.getOperand(1).getReg()) {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ } else {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
+ : AArch64::ORRv16i8))
+ .addReg(DstReg,
+ RegState::Define |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(1));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ }
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+
case AArch64::ADDWrr:
case AArch64::SUBWrr:
case AArch64::ADDXrr:
@@ -599,10 +890,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
MachineFunction *MF = MBB.getParent();
- if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
- MF->getTarget().getCodeModel() == CodeModel::Kernel)
- SysReg = AArch64SysReg::TPIDR_EL1;
- else if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
+ if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
SysReg = AArch64SysReg::TPIDR_EL3;
else if (MF->getSubtarget<AArch64Subtarget>().useEL2ForTP())
SysReg = AArch64SysReg::TPIDR_EL2;
@@ -676,7 +964,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// almost always point to SP-after-prologue; if not, emit a longer
// instruction sequence.
int BaseOffset = -AFI->getTaggedBasePointerOffset();
- unsigned FrameReg;
+ Register FrameReg;
StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
/*PreferFP=*/false,
@@ -706,9 +994,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::STGloop_wback:
+ case AArch64::STZGloop_wback:
+ return expandSetTagLoop(MBB, MBBI, NextMBBI);
case AArch64::STGloop:
case AArch64::STZGloop:
- return expandSetTagLoop(MBB, MBBI, NextMBBI);
+ report_fatal_error(
+ "Non-writeback variants of STGloop / STZGloop should not "
+ "survive past PrologEpilogInserter.");
+ case AArch64::STR_ZZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
+ case AArch64::STR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
+ case AArch64::STR_ZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
+ case AArch64::LDR_ZZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
+ case AArch64::LDR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
+ case AArch64::LDR_ZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
}
return false;
}
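
For reference, the dispatch into expand_DestructiveOp keys off the
TableGen-generated SVE pseudo tables. A compact sketch of the classification
(the helper name is illustrative; the queries mirror those in expandMI above):

    // True if PseudoOpc is an SVE pseudo whose real opcode carries a
    // destructive-operand class, i.e. it takes the expansion path above.
    static bool isDestructiveSVEPseudo(const TargetInstrInfo *TII,
                                       unsigned PseudoOpc) {
      int OrigOpc = AArch64::getSVEPseudoMap(PseudoOpc); // -1 if not mapped
      if (OrigOpc == -1)
        return false;
      uint64_t DType =
          TII->get(OrigOpc).TSFlags & AArch64::DestructiveInstTypeMask;
      return DType != AArch64::NotDestructive;
    }
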
diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index c1fc183b04f6f..538863ebe95af 100644
--- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -823,9 +823,6 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
- assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
- "Register liveness not available!");
-
MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
Modified = false;
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 7e9c68f2bb305..0f63f4ca62e5e 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -434,11 +434,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
// Materialize via constant pool. MachineConstantPool wants an explicit
// alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0)
- Align = DL.getTypeAllocSize(CFP->getType());
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
- unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
@@ -1130,7 +1128,7 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr,
// and alignment should be based on the VT.
MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI).addImm(Offset);
} else {
@@ -3137,7 +3135,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
Addr.setReg(AArch64::SP);
Addr.setOffset(VA.getLocMemOffset() + BEAlign);
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
@@ -3272,7 +3270,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
if (Subtarget->useSmallAddressing()) {
- const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+ const MCInstrDesc &II =
+ TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : (unsigned)AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
MIB.addSym(Symbol, 0);
@@ -3305,7 +3304,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!CallReg)
return false;
- const MCInstrDesc &II = TII.get(AArch64::BLR);
+ const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF));
CallReg = constrainOperandRegClass(II, CallReg, 0);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ea3e800a1ad20..efa3fd5ca9cef 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,8 +170,45 @@ static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> StackTaggingMergeSetTag(
+ "stack-tagging-merge-settag",
+ cl::desc("merge settag instruction in function epilog"), cl::init(true),
+ cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// Returns the argument pop size.
+static uint64_t getArgumentPopSize(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ bool IsTailCallReturn = false;
+ if (MBB.end() != MBBI) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
+ RetOpcode == AArch64::TCRETURNri ||
+ RetOpcode == AArch64::TCRETURNriBTI;
+ }
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ uint64_t ArgumentPopSize = 0;
+ if (IsTailCallReturn) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments; this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ return ArgumentPopSize;
+}
+
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exception here are vector stores/loads which cannot encode any
@@ -211,6 +248,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const {
return TargetStackID::SVEVector;
}
+/// Returns the size of the fixed object area (allocated next to sp on entry).
+/// On Win64 this may include a var args area and an UnwindHelp object for EH.
+static unsigned getFixedObjectSize(const MachineFunction &MF,
+ const AArch64FunctionInfo *AFI, bool IsWin64,
+ bool IsFunclet) {
+ if (!IsWin64 || IsFunclet) {
+ // Only Win64 uses fixed objects, and then only for the function (not
+ // funclets)
+ return 0;
+ } else {
+ // Var args are stored here in the primary function.
+ const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
+ // To support EH funclets we allocate an UnwindHelp object
+ const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
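+    // E.g. (illustrative): 24 bytes of GPR var args plus the 8-byte
+    // UnwindHelp object gives alignTo(24 + 8, 16) == 32 bytes.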
+ return alignTo(VarArgsArea + UnwindHelpObject, 16);
+ }
+}
+
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -286,10 +341,8 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
- unsigned Align = getStackAlignment();
-
int64_t Amount = I->getOperand(0).getImm();
- Amount = alignTo(Amount, Align);
+ Amount = alignTo(Amount, getStackAlign());
if (!IsDestroy)
Amount = -Amount;
@@ -480,6 +533,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
return true;
}
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+ MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+ if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+ return false;
+
+ if (MBB.empty())
+ return true;
+
+ // Disable combined SP bump if the last instruction is an MTE tag store. It
+ // is almost always better to merge SP adjustment into those instructions.
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastI != Begin) {
+ --LastI;
+ if (LastI->isTransient())
+ continue;
+ if (!LastI->getFlag(MachineInstr::FrameDestroy))
+ break;
+ }
+ switch (LastI->getOpcode()) {
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return false;
+ default:
+ return true;
+ }
+ llvm_unreachable("unreachable");
+}
+
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -940,11 +1026,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
@@ -959,10 +1045,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- // Var args are accounted for in the containing function, so don't
- // include them for funclets.
- unsigned FixedObject = (IsWin64 && !IsFunclet) ?
- alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
@@ -993,32 +1076,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
}
- // The code below is not applicable to funclets. We have emitted all the SEH
- // opcodes that we needed to emit. The FP and BP belong to the containing
- // function.
- if (IsFunclet) {
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // SEH funclets are passed the frame pointer in X1. If the parent
- // function uses the base register, then the base register is used
- // directly, and is not retrieved from X1.
- if (F.hasPersonalityFn()) {
- EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
- if (isAsynchronousEHPersonality(Per)) {
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
- .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup);
- MBB.addLiveIn(AArch64::X1);
- }
- }
-
- return;
- }
-
- if (HasFP) {
+ // For funclets the FP belongs to the containing function.
+ if (!IsFunclet && HasFP) {
// Only set up FP if we actually need to.
int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
@@ -1099,7 +1158,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+ BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
.addReg(AArch64::X16, RegState::Kill)
.addReg(AArch64::X15, RegState::Implicit | RegState::Define)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
@@ -1161,7 +1220,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space for the rest of the frame.
if (NumBytes) {
- const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ // Alignment is required for the parent frame, not the funclet
+ const bool NeedsRealignment =
+ !IsFunclet && RegInfo->needsStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
if (NeedsRealignment) {
@@ -1179,8 +1240,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
- const unsigned Alignment = MFI.getMaxAlignment();
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
assert(NrBitsToZero > 1);
assert(scratchSPReg != AArch64::SP);
@@ -1215,7 +1275,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
// needed.
- if (RegInfo->hasBasePointer(MF)) {
+ // For funclets the BP belongs to the containing function.
+ if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
false);
if (NeedsWinCFI) {
@@ -1232,6 +1293,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ // SEH funclets are passed the frame pointer in X1. If the parent
+ // function uses the base register, then the base register is used
+ // directly, and is not retrieved from X1.
+ if (IsFunclet && F.hasPersonalityFn()) {
+ EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+ if (isAsynchronousEHPersonality(Per)) {
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
+ .addReg(AArch64::X1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(AArch64::X1);
+ }
+ }
+
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
const int StackGrowth = isTargetDarwin(MF)
@@ -1307,15 +1381,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, Reg, StackGrowth - FixedObject));
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1374,7 +1448,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
- bool IsTailCallReturn = false;
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
@@ -1385,10 +1458,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
- unsigned RetOpcode = MBBI->getOpcode();
- IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
- RetOpcode == AArch64::TCRETURNri ||
- RetOpcode == AArch64::TCRETURNriBTI;
IsFunclet = isFuncletReturnInstr(*MBBI);
}
@@ -1403,21 +1472,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
- uint64_t ArgumentPopSize = 0;
- if (IsTailCallReturn) {
- MachineOperand &StackAdjust = MBBI->getOperand(1);
-
- // For a tail-call in a callee-pops-arguments environment, some or all of
- // the stack may actually be in use for the call's arguments, this is
- // calculated during LowerCall and consumed here...
- ArgumentPopSize = StackAdjust.getImm();
- } else {
- // ... otherwise the amount to pop is *all* of the argument space,
- // conveniently stored in the MachineFunctionInfo by
- // LowerFormalArguments. This will, of course, be zero for the C calling
- // convention.
- ArgumentPopSize = AFI->getArgumentStackToRestore();
- }
+ uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
// The stack frame should be like below,
//
@@ -1450,10 +1505,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- // Var args are accounted for in the containing function, so don't
- // include them for funclets.
- unsigned FixedObject =
- (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
@@ -1463,7 +1515,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
- bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -1660,7 +1712,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
/// SP-relative and simple call frames aren't used.
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
return resolveFrameIndexReference(
MF, FI, FrameReg,
/*PreferFP=*/
@@ -1679,7 +1731,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset)
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ unsigned FixedObject =
+ getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
unsigned FPAdjust = isTargetDarwin(MF)
? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
@@ -1701,7 +1755,7 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
}
StackOffset AArch64FrameLowering::resolveFrameIndexReference(
- const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
+ const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
@@ -1713,7 +1767,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
- unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
+ Register &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
@@ -1764,10 +1818,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
- else if (!CanUseBP) { // Can't use BP. Forced to use FP.
- assert(!SVEStackSize && "Expected BP to be available");
+ else if (!CanUseBP) // Can't use BP. Forced to use FP.
UseFP = true;
- }
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
@@ -1933,7 +1985,7 @@ struct RegPairInfo {
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
- MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
@@ -2058,8 +2110,8 @@ static void computeCalleeSaveRegisterPairs(
FixupDone = true;
ByteOffset -= 8;
assert(ByteOffset % 16 == 0);
- assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
- MFI.setObjectAlignment(RPI.FrameIdx, 16);
+ assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
+ MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
}
int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
@@ -2078,8 +2130,7 @@ static void computeCalleeSaveRegisterPairs(
bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
bool NeedsWinCFI = needsWinCFI(MF);
@@ -2142,32 +2193,33 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
// Note: Similar rationale and sequence for restores in epilog.
- unsigned Size, Align;
+ unsigned Size;
+ Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR64:
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR128:
StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::ZPR:
StrOpc = AArch64::STR_ZXI;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
- Align = 2;
+ Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
@@ -2196,7 +2248,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
- MachineMemOperand::MOStore, Size, Align));
+ MachineMemOperand::MOStore, Size, Alignment));
}
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
@@ -2204,8 +2256,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF,FrameIdxReg1),
- MachineMemOperand::MOStore, Size, Align));
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameSetup);
@@ -2220,8 +2272,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
@@ -2248,32 +2299,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
// ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
- unsigned Size, Align;
+ unsigned Size;
+ Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR64:
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR128:
LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::ZPR:
LdrOpc = AArch64::LDR_ZXI;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::PPR:
LdrOpc = AArch64::LDR_PXI;
Size = 2;
- Align = 2;
+ Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
@@ -2296,7 +2348,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MIB.addReg(Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
- MachineMemOperand::MOLoad, Size, Align));
+ MachineMemOperand::MOLoad, Size, Alignment));
}
MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
@@ -2305,7 +2357,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOLoad, Size, Align));
+ MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
};
@@ -2348,6 +2400,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2396,6 +2449,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
+ !Subtarget.isTargetWindows()) {
+ // For Windows calling convention on a non-windows OS, where X18 is treated
+ // as reserved, back up X18 when entering non-windows code (marked with the
+ // Windows calling convention) and restore when returning regardless of
+ // whether the individual function uses it - it might call other functions
+ // that clobber it.
+ SavedRegs.set(AArch64::X18);
+ }
+
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
unsigned SVECSStackSize = 0;
@@ -2467,8 +2530,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass &RC = AArch64::GPR64RegClass;
unsigned Size = TRI->getSpillSize(RC);
- unsigned Align = TRI->getSpillAlignment(RC);
- int FI = MFI.CreateStackObject(Size, Align, false);
+ Align Alignment = TRI->getSpillAlign(RC);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
RS->addScavengingFrameIndex(FI);
LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
@@ -2549,12 +2612,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
// Then process all callee saved slots.
if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
// Make sure to align the last callee save slot.
- MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+ MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));
// Assign offsets to the callee save slots.
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
Offset += MFI.getObjectSize(I);
- Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+ Offset = alignTo(Offset, MFI.getObjectAlign(I));
if (AssignOffsets)
Assign(I, -Offset);
}
@@ -2576,15 +2639,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
// Allocate all SVE locals and spills
for (unsigned FI : ObjectsToAllocate) {
- unsigned Align = MFI.getObjectAlignment(FI);
+ Align Alignment = MFI.getObjectAlign(FI);
// FIXME: Given that the length of SVE vectors is not necessarily a power of
// two, we'd need to align every object dynamically at runtime if the
// alignment is larger than 16. This is not yet supported.
- if (Align > 16)
+ if (Alignment > Align(16))
report_fatal_error(
"Alignment of scalable vectors > 16 bytes is not yet supported");
- Offset = alignTo(Offset + MFI.getObjectSize(FI), Align);
+ Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
if (AssignOffsets)
Assign(FI, -Offset);
}
@@ -2632,9 +2695,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
++MBBI;
// Create an UnwindHelp object.
- int UnwindHelpFI =
- MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
+ // The UnwindHelp object is allocated at the start of the fixed object area
+ int64_t FixedObject =
+ getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
+ int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
+ /*SPOffset*/ -FixedObject,
+ /*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
// We need to store -2 into the UnwindHelp object at the start of the
// function.
DebugLoc DL;
@@ -2649,17 +2717,411 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
.addImm(0);
}
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+namespace {
+struct TagStoreInstr {
+ MachineInstr *MI;
+ int64_t Offset, Size;
+ explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+ : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+class TagStoreEdit {
+ MachineFunction *MF;
+ MachineBasicBlock *MBB;
+ MachineRegisterInfo *MRI;
+ // Tag store instructions that are being replaced.
+ SmallVector<TagStoreInstr, 8> TagStores;
+ // Combined memref arguments of the above instructions.
+ SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
+
+ // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
+ // FrameRegOffset + Size) with the address tag of SP.
+ Register FrameReg;
+ StackOffset FrameRegOffset;
+ int64_t Size;
+ // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
+ Optional<int64_t> FrameRegUpdate;
+ // MIFlags for any FrameReg updating instructions.
+ unsigned FrameRegUpdateFlags;
+
+ // Use zeroing instruction variants.
+ bool ZeroData;
+ DebugLoc DL;
+
+ void emitUnrolled(MachineBasicBlock::iterator InsertI);
+ void emitLoop(MachineBasicBlock::iterator InsertI);
+
+public:
+ TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
+ : MBB(MBB), ZeroData(ZeroData) {
+ MF = MBB->getParent();
+ MRI = &MF->getRegInfo();
+ }
+  // Add an instruction to be replaced. Instructions must be added in
+  // ascending order of Offset and must be adjacent.
+ void addInstruction(TagStoreInstr I) {
+ assert((TagStores.empty() ||
+ TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
+ "Non-adjacent tag store instructions.");
+ TagStores.push_back(I);
+ }
+ void clear() { TagStores.clear(); }
+ // Emit equivalent code at the given location, and erase the current set of
+ // instructions. May skip if the replacement is not profitable. May invalidate
+ // the input iterator and replace it with a valid one.
+ void emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast);
+};
+
+void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ const int64_t kMinOffset = -256 * 16;
+ const int64_t kMaxOffset = 255 * 16;
+
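+  // Unrolled form, e.g. for Size == 48 (illustrative):
+  //   st2g sp, [BaseReg, #0]   ; tags 32 bytes
+  //   stg  sp, [BaseReg, #32]  ; tags the trailing 16 bytes
+  // The store at offset #0, if any, is spliced to the end below so the
+  // epilogue's final SP adjustment can later fold into it.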
+ Register BaseReg = FrameReg;
+ int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+ if (BaseRegOffsetBytes < kMinOffset ||
+ BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+ Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
+ {BaseRegOffsetBytes, MVT::i8}, TII);
+ BaseReg = ScratchReg;
+ BaseRegOffsetBytes = 0;
+ }
+
+ MachineInstr *LastI = nullptr;
+ while (Size) {
+ int64_t InstrSize = (Size > 16) ? 32 : 16;
+ unsigned Opcode =
+ InstrSize == 16
+ ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+ : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+ MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
+ .addReg(AArch64::SP)
+ .addReg(BaseReg)
+ .addImm(BaseRegOffsetBytes / 16)
+ .setMemRefs(CombinedMemRefs);
+ // A store to [BaseReg, #0] should go last for an opportunity to fold the
+ // final SP adjustment in the epilogue.
+ if (BaseRegOffsetBytes == 0)
+ LastI = I;
+ BaseRegOffsetBytes += InstrSize;
+ Size -= InstrSize;
+ }
+
+ if (LastI)
+ MBB->splice(InsertI, MBB, LastI);
+}
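+
+// Worked example: Size == 48, BaseRegOffsetBytes == 0, ZeroData == false.
+// The loop above emits ST2G at offset 0 (tagging bytes 0..31) and STG at
+// offset 32 (bytes 32..47; the encoded immediate is the offset scaled down
+// by 16). The store at offset 0 is then spliced to the end so that a final
+// SP adjustment in the epilogue has a chance to be folded into it.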
+
+void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ Register BaseReg = FrameRegUpdate
+ ? FrameReg
+ : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
+
+ int64_t LoopSize = Size;
+  // If the loop size is not a multiple of 32, split off one 16-byte store at
+  // the end to fold the BaseReg update into.
+ if (FrameRegUpdate && *FrameRegUpdate)
+ LoopSize -= LoopSize % 32;
+ MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGloop_wback
+ : AArch64::STGloop_wback))
+ .addDef(SizeReg)
+ .addDef(BaseReg)
+ .addImm(LoopSize)
+ .addReg(BaseReg)
+ .setMemRefs(CombinedMemRefs);
+ if (FrameRegUpdate)
+ LoopI->setFlags(FrameRegUpdateFlags);
+
+ int64_t ExtraBaseRegUpdate =
+ FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
+ if (LoopSize < Size) {
+ assert(FrameRegUpdate);
+ assert(Size - LoopSize == 16);
+ // Tag 16 more bytes at BaseReg and update BaseReg.
+ BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addReg(BaseReg)
+ .addImm(1 + ExtraBaseRegUpdate / 16)
+ .setMemRefs(CombinedMemRefs)
+ .setMIFlags(FrameRegUpdateFlags);
+ } else if (ExtraBaseRegUpdate) {
+ // Update BaseReg.
+ BuildMI(
+ *MBB, InsertI, DL,
+ TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addImm(std::abs(ExtraBaseRegUpdate))
+ .addImm(0)
+ .setMIFlags(FrameRegUpdateFlags);
+ }
+}
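+
+// Worked example: Size == 176 with a pending nonzero frame register update.
+// LoopSize is rounded down to 160, STGloop_wback tags those bytes while
+// advancing BaseReg, and the remaining 16 bytes are tagged by a post-indexed
+// STG whose write-back immediate (1 + ExtraBaseRegUpdate / 16) also applies
+// the extra base register adjustment.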
+
+// Check if *II is a register update that can be merged into the STGloop that
+// ends at (Reg + Size). On success, *TotalOffset is set to the update's total
+// adjustment of Reg, to be applied after the end of the loop.
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+ int64_t Size, int64_t *TotalOffset) {
+ MachineInstr &MI = *II;
+ if ((MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::SUBXri) &&
+ MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+ unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+ int64_t Offset = MI.getOperand(2).getImm() << Shift;
+ if (MI.getOpcode() == AArch64::SUBXri)
+ Offset = -Offset;
+ int64_t AbsPostOffset = std::abs(Offset - Size);
+ const int64_t kMaxOffset =
+ 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+ if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+ *TotalOffset = Offset;
+ return true;
+ }
+ }
+ return false;
+}
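+
+// For example, an epilogue "add sp, sp, #N" immediately following the loop
+// is mergeable when the leftover |N - Size| is a multiple of 16 that still
+// fits the unshifted 12-bit ADD/SUB immediate.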
+
+void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs) {
+ MemRefs.clear();
+ for (auto &TS : TSE) {
+ MachineInstr *MI = TS.MI;
+ // An instruction without memory operands may access anything. Be
+ // conservative and return an empty list.
+ if (MI->memoperands_empty()) {
+ MemRefs.clear();
+ return;
+ }
+ MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
+ }
+}
+
+void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast) {
+ if (TagStores.empty())
+ return;
+ TagStoreInstr &FirstTagStore = TagStores[0];
+ TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
+ Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
+ DL = TagStores[0].MI->getDebugLoc();
+
+ Register Reg;
+ FrameRegOffset = TFI->resolveFrameOffsetReference(
+ *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+ /*PreferFP=*/false, /*ForSimm=*/true);
+ FrameReg = Reg;
+ FrameRegUpdate = None;
+
+ mergeMemRefs(TagStores, CombinedMemRefs);
+
+ LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+ for (const auto &Instr
+ : TagStores) { dbgs() << " " << *Instr.MI; });
+
+ // Size threshold where a loop becomes shorter than a linear sequence of
+ // tagging instructions.
+ const int kSetTagLoopThreshold = 176;
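+  // (176 bytes is six unrolled tag stores: five ST2G plus one trailing STG,
+  // which is roughly the size of the expanded STGloop sequence.)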
+ if (Size < kSetTagLoopThreshold) {
+ if (TagStores.size() < 2)
+ return;
+ emitUnrolled(InsertI);
+ } else {
+ MachineInstr *UpdateInstr = nullptr;
+ int64_t TotalOffset;
+ if (IsLast) {
+      // See if we can merge the base register update into the STGloop.
+      // This is done in AArch64LoadStoreOptimizer for "normal" stores, but
+      // STGloop is far too unusual for that pass; the opportunity also only
+      // realistically arises in the function epilogue, and STGloop is
+      // expanded before that pass runs anyway.
+ if (InsertI != MBB->end() &&
+ canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+ &TotalOffset)) {
+ UpdateInstr = &*InsertI++;
+ LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
+ << *UpdateInstr);
+ }
+ }
+
+ if (!UpdateInstr && TagStores.size() < 2)
+ return;
+
+ if (UpdateInstr) {
+ FrameRegUpdate = TotalOffset;
+ FrameRegUpdateFlags = UpdateInstr->getFlags();
+ }
+ emitLoop(InsertI);
+ if (UpdateInstr)
+ UpdateInstr->eraseFromParent();
+ }
+
+ for (auto &TS : TagStores)
+ TS.MI->eraseFromParent();
+}
+
+bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
+ int64_t &Size, bool &ZeroData) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned Opcode = MI.getOpcode();
+ ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+ Opcode == AArch64::STZ2GOffset);
+
+ if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+ if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+ return false;
+ if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+ return false;
+ Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+ Size = MI.getOperand(2).getImm();
+ return true;
+ }
+
+ if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+ Size = 16;
+ else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+ Size = 32;
+ else
+ return false;
+
+ if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+ return false;
+
+ Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+ 16 * MI.getOperand(2).getImm();
+ return true;
+}
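+
+// For example, "STGOffset $sp, %stack.0, 2" yields Size = 16 and
+// Offset = offset(%stack.0) + 32, while "STGloop 256, %stack.1" (with dead
+// register defs) yields Size = 256 and Offset = offset(%stack.1).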
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
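+// For example, STG stores at offsets 0 and 16 of the same frame region become
+// one ST2G, while runs separated by a gap are rewritten independently.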
+MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI,
+ RegScavenger *RS) {
+ bool FirstZeroData;
+ int64_t Size, Offset;
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator NextI = ++II;
+ if (&MI == &MBB->instr_back())
+ return II;
+ if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
+ return II;
+
+ SmallVector<TagStoreInstr, 4> Instrs;
+ Instrs.emplace_back(&MI, Offset, Size);
+
+ constexpr int kScanLimit = 10;
+ int Count = 0;
+ for (MachineBasicBlock::iterator E = MBB->end();
+ NextI != E && Count < kScanLimit; ++NextI) {
+ MachineInstr &MI = *NextI;
+ bool ZeroData;
+ int64_t Size, Offset;
+ // Collect instructions that update memory tags with a FrameIndex operand
+ // and (when applicable) constant size, and whose output registers are dead
+ // (the latter is almost always the case in practice). Since these
+ // instructions effectively have no inputs or outputs, we are free to skip
+ // any non-aliasing instructions in between without tracking used registers.
+ if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
+ if (ZeroData != FirstZeroData)
+ break;
+ Instrs.emplace_back(&MI, Offset, Size);
+ continue;
+ }
+
+ // Only count non-transient, non-tagging instructions toward the scan
+ // limit.
+ if (!MI.isTransient())
+ ++Count;
+
+ // Just in case, stop before the epilogue code starts.
+ if (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy))
+ break;
+
+ // Reject anything that may alias the collected instructions.
+ if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+ break;
+ }
+
+ // New code will be inserted after the last tagging instruction we've found.
+ MachineBasicBlock::iterator InsertI = Instrs.back().MI;
+ InsertI++;
+
+ llvm::stable_sort(Instrs,
+ [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+ return Left.Offset < Right.Offset;
+ });
+
+ // Make sure that we don't have any overlapping stores.
+ int64_t CurOffset = Instrs[0].Offset;
+ for (auto &Instr : Instrs) {
+ if (CurOffset > Instr.Offset)
+ return NextI;
+ CurOffset = Instr.Offset + Instr.Size;
+ }
+
+  // Find contiguous runs of tagged memory and emit shorter instruction
+  // sequences for them when possible.
+ TagStoreEdit TSE(MBB, FirstZeroData);
+ Optional<int64_t> EndOffset;
+ for (auto &Instr : Instrs) {
+ if (EndOffset && *EndOffset != Instr.Offset) {
+ // Found a gap.
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+ TSE.clear();
+ }
+
+ TSE.addInstruction(Instr);
+ EndOffset = Instr.Offset + Instr.Size;
+ }
+
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+
+ return InsertI;
+}
+} // namespace
+
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS = nullptr) const {
+ if (StackTaggingMergeSetTag)
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
- const MachineFunction &MF, int FI, unsigned &FrameReg,
+ const MachineFunction &MF, int FI, Register &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
- << MFI.getObjectOffset(FI) << "\n");
- FrameReg = AArch64::SP;
- return MFI.getObjectOffset(FI);
+ if (IgnoreSPUpdates) {
+ LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
+ << MFI.getObjectOffset(FI) << "\n");
+ FrameReg = AArch64::SP;
+ return MFI.getObjectOffset(FI);
+ }
+
+ return getFrameIndexReference(MF, FI, FrameReg);
}
/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
@@ -2678,5 +3140,5 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
// This is the amount of stack a funclet needs to allocate.
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
- getStackAlignment());
+ getStackAlign());
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index b5719feb6b154..9d0a6d9eaf255 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -24,8 +24,9 @@ public:
: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
- void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
+ void
+ emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -39,23 +40,24 @@ public:
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, bool PreferFP,
+ Register &FrameReg, bool PreferFP,
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
int64_t ObjectOffset, bool isFixed,
- bool isSVE, unsigned &FrameReg,
+ bool isSVE, Register &FrameReg,
bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
/// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
@@ -77,12 +79,16 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg,
+ Register &FrameReg,
bool IgnoreSPUpdates) const override;
int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const override;
@@ -107,6 +113,8 @@ private:
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
+ bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index a51aa85a931c0..10c4778533533 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -62,6 +62,9 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
+ template <signed Low, signed High, signed Scale>
+ bool SelectRDVLImm(SDValue N, SDValue &Imm);
+
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
@@ -159,6 +162,24 @@ public:
return false;
}
+ bool SelectDupZero(SDValue N) {
+ switch(N->getOpcode()) {
+ case AArch64ISD::DUP:
+ case ISD::SPLAT_VECTOR: {
+ auto Opnd0 = N->getOperand(0);
+ if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
+ if (CN->isNullValue())
+ return true;
+ if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
+ if (CN->isZero())
+ return true;
+ break;
+ }
+ }
+
+ return false;
+ }
+
template<MVT::SimpleValueType VT>
bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
return SelectSVEAddSubImm(N, VT, Imm, Shift);
@@ -169,6 +190,11 @@ public:
return SelectSVELogicalImm(N, VT, Imm);
}
+ template <unsigned Low, unsigned High>
+ bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
+ return SelectSVEShiftImm64(N, Low, High, Imm);
+ }
+
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
template<signed Min, signed Max, signed Scale, bool Shift>
bool SelectCntImm(SDValue N, SDValue &Imm) {
@@ -197,6 +223,9 @@ public:
/// unchanged; otherwise a REG_SEQUENCE value is returned.
SDValue createDTuple(ArrayRef<SDValue> Vecs);
SDValue createQTuple(ArrayRef<SDValue> Vecs);
+  // Form a sequence of SVE registers for instructions that use a list of
+  // vectors, e.g. structured loads and stores (ldN, stN).
+ SDValue createZTuple(ArrayRef<SDValue> Vecs);
/// Generic helper for the createDTuple/createQTuple
/// functions. Those should almost always be called instead.
@@ -216,11 +245,31 @@ public:
unsigned SubRegIdx);
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
+
+ bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+ /// SVE Reg+Imm addressing mode.
+ template <int64_t Min, int64_t Max>
+ bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ /// SVE Reg+Reg address mode.
+ template <unsigned Scale>
+ bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+ return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+ }
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ template <unsigned Scale>
+ void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr,
+ const unsigned Opc_ri);
+ template <unsigned Scale>
+ std::tuple<unsigned, SDValue, SDValue>
+ findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
+ const unsigned Opc_ri, const SDValue &OldBase,
+ const SDValue &OldOffset);
bool tryBitfieldExtractOp(SDNode *N);
bool tryBitfieldExtractOpFromSExt(SDNode *N);
@@ -268,13 +317,19 @@ private:
bool SelectCMP_SWAP(SDNode *N);
+ bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
+
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
+ SDValue &Imm);
bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+ SDValue &Offset);
};
} // end anonymous namespace
@@ -679,6 +734,23 @@ static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
return SDValue(Node, 0);
}
+// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
+template<signed Low, signed High, signed Scale>
+bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
+ if ((MulImm % std::abs(Scale)) == 0) {
+ int64_t RDVLImm = MulImm / Scale;
+ if ((RDVLImm >= Low) && (RDVLImm <= High)) {
+ Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
+}
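+
+// For example, with Low = -32, High = 31 and Scale = 16, a multiplier of 32
+// is accepted (32 / 16 == 2 is in range) and Imm becomes the constant 2,
+// while 24 is rejected because it is not a multiple of the scale.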
/// SelectArithExtendedRegister - Select an "extended register" operand. This
/// operand folds in an extend followed by an optional left shift.
@@ -832,16 +904,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
if (!GAN)
return true;
- if (GAN->getOffset() % Size == 0) {
- const GlobalValue *GV = GAN->getGlobal();
- unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getValueType();
- if (Alignment == 0 && Ty->isSized())
- Alignment = DL.getABITypeAlignment(Ty);
-
- if (Alignment >= Size)
- return true;
- }
+ if (GAN->getOffset() % Size == 0 &&
+ GAN->getGlobal()->getPointerAlignment(DL) >= Size)
+ return true;
}
if (CurDAG->isBaseWithConstantOffset(N)) {
@@ -1132,6 +1197,16 @@ SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
return createTuple(Regs, RegClassIDs, SubRegs);
}
+SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
+ static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
+ AArch64::ZPR3RegClassID,
+ AArch64::ZPR4RegClassID};
+ static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
const unsigned RegClassIDs[],
const unsigned SubRegs[]) {
@@ -1240,6 +1315,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
}
} else if (VT == MVT::f16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+ } else if (VT == MVT::bf16) {
+ Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
@@ -1334,6 +1411,54 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+/// Optimize \param OldBase and \param OldOffset by selecting the best
+/// addressing mode. Returns a tuple consisting of an Opcode, an SDValue
+/// representing the new Base, and an SDValue representing the new Offset.
+template <unsigned Scale>
+std::tuple<unsigned, SDValue, SDValue>
+AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
+ const unsigned Opc_ri,
+ const SDValue &OldBase,
+ const SDValue &OldOffset) {
+ SDValue NewBase = OldBase;
+ SDValue NewOffset = OldOffset;
+ // Detect a possible Reg+Imm addressing mode.
+ const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
+ N, OldBase, NewBase, NewOffset);
+
+ // Detect a possible reg+reg addressing mode, but only if we haven't already
+ // detected a Reg+Imm one.
+ const bool IsRegReg =
+ !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset);
+
+ // Select the instruction.
+ return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
+}
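+
+// The Reg+Imm form (a vector-scaled immediate in [-8, 7]) is preferred; the
+// Reg+Reg form is only tried when it fails, and the _ri opcode with the
+// original operands remains the fallback when neither matcher fires.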
+
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+ const unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SDValue Ops[] = {N->getOperand(1), // Predicate
+ N->getOperand(2), // Memory operand
+ CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(N);
+}
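+
+// For example, an SVE_LD2_MERGE_ZERO of nxv4i32 selects LD2W_IMM; the two
+// result vectors are then extracted from the Untyped tuple via zsub0 and
+// zsub1, and the chain result is rewired to the new load.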
+
void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
@@ -1354,6 +1479,49 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
ReplaceNode(N, St);
}
+template <unsigned Scale>
+void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
+ const unsigned Opc_rr,
+ const unsigned Opc_ri) {
+ SDLoc dl(N);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+ SDValue RegSeq = createZTuple(Regs);
+
+ // Optimize addressing mode.
+ unsigned Opc;
+ SDValue Offset, Base;
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>(
+ N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
+ CurDAG->getTargetConstant(0, dl, MVT::i64));
+
+ SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
+ Base, // address
+ Offset, // offset
+ N->getOperand(0)}; // chain
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+ ReplaceNode(N, St);
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering *TLI = getTargetLowering();
+
+ // Try to match it for the frame address
+ if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
+ int FI = FINode->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+ }
+
+ return false;
+}
+
void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
@@ -2632,7 +2800,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
// bits that are implicitly ANDed off by the above opcodes and if so, skip
// the AND.
uint64_t MaskImm;
- if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+ if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+ !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
return false;
if (countTrailingOnes(MaskImm) < Bits)
@@ -2879,6 +3048,32 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
return true;
}
+bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
+ SDValue &Offset) {
+ auto C = dyn_cast<ConstantSDNode>(N);
+ if (!C)
+ return false;
+
+ auto Ty = N->getValueType(0);
+
+ int64_t Imm = C->getSExtValue();
+ SDLoc DL(N);
+
+ if ((Imm >= -128) && (Imm <= 127)) {
+ Base = CurDAG->getTargetConstant(Imm, DL, Ty);
+ Offset = CurDAG->getTargetConstant(0, DL, Ty);
+ return true;
+ }
+
+ if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
+ Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
+ Offset = CurDAG->getTargetConstant(8, DL, Ty);
+ return true;
+ }
+
+ return false;
+}
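+
+// For example, 100 matches the first form and yields (100, LSL #0); 16384 is
+// a multiple of 256 in range and yields (64, LSL #8); 300 matches neither
+// form and is rejected.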
+
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
const int64_t ImmVal = CNode->getZExtValue();
@@ -2917,7 +3112,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
int64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
- if (ImmVal >= -127 && ImmVal < 127) {
+ if (ImmVal >= -128 && ImmVal < 128) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
@@ -2975,6 +3170,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
return false;
}
+// This method is only needed to "cast" i64s into i32s when the value is a
+// valid shift amount which has been splatted into a vector with i64 elements.
+// Every other type can be handled directly in tablegen.
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
+ uint64_t High, SDValue &Imm) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CN->getZExtValue();
+ SDLoc DL(N);
+
+ if (ImmVal >= Low && ImmVal <= High) {
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
+}
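+
+// For example, with Low = 1 and High = 64 (e.g. a right shift on i64
+// elements), a splatted amount of 40 is re-emitted as an i32 target constant,
+// while 0 or 65 would fall outside the range and be rejected.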
+
bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
// tagp(FrameIndex, IRGstack, tag_offset):
// since the offset between FrameIndex and IRGstack is a compile-time
@@ -3027,6 +3240,63 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
ReplaceNode(N, N3);
}
+// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
+// vector types larger than NEON don't have a matching SubRegIndex.
+static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+ assert(V.getValueType().isScalableVector() &&
+ V.getValueType().getSizeInBits().getKnownMinSize() ==
+ AArch64::SVEBitsPerBlock &&
+ "Expected to extract from a packed scalable vector!");
+ assert(VT.isFixedLengthVector() &&
+ "Expected to extract a fixed length vector!");
+
+ SDLoc DL(V);
+ switch (VT.getSizeInBits()) {
+ case 64: {
+ auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+ return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+ }
+ case 128: {
+ auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+ return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+ }
+ default: {
+ auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+ return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+ }
+ }
+}
+
+// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
+// vector types larger than NEON don't have a matching SubRegIndex.
+static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
+ "Expected to insert into a packed scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected to insert a fixed length vector!");
+
+ SDLoc DL(V);
+ switch (V.getValueType().getSizeInBits()) {
+ case 64: {
+ auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+ auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+ SDValue(Container, 0), V, SubReg);
+ }
+ case 128: {
+ auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+ auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+ SDValue(Container, 0), V, SubReg);
+ }
+ default: {
+ auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+ return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+ }
+ }
+}
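+
+// For example, inserting a 128-bit v2i64 into nxv2i64 goes through
+// INSERT_SUBREG on an IMPLICIT_DEF container via zsub, while a 256-bit fixed
+// length vector has no matching subregister index and falls back to
+// COPY_TO_REGCLASS into the ZPR class.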
+
void AArch64DAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -3100,6 +3370,52 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
break;
+ case ISD::EXTRACT_SUBVECTOR: {
+    // Bail when this is not a "cast"-like extract_subvector.
+ if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
+ break;
+
+ // Bail when normal isel can do the job.
+ EVT InVT = Node->getOperand(0).getValueType();
+ if (VT.isScalableVector() || InVT.isFixedLengthVector())
+ break;
+
+ // NOTE: We can only get here when doing fixed length SVE code generation.
+ // We do manual selection because the types involved are not linked to real
+ // registers (despite being legal) and must be coerced into SVE registers.
+ //
+ // NOTE: If the above changes, be aware that selection will still not work
+ // because the td definition of extract_vector does not support extracting
+ // a fixed length vector from a scalable vector.
+
+ ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
+ return;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+    // Bail when this is not a "cast"-like insert_subvector.
+ if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
+ break;
+ if (!Node->getOperand(0).isUndef())
+ break;
+
+ // Bail when normal isel should do the job.
+ EVT InVT = Node->getOperand(1).getValueType();
+ if (VT.isFixedLengthVector() || InVT.isScalableVector())
+ break;
+
+ // NOTE: We can only get here when doing fixed length SVE code generation.
+ // We do manual selection because the types involved are not linked to real
+ // registers (despite being legal) and must be coerced into SVE registers.
+ //
+ // NOTE: If the above changes, be aware that selection will still not work
+ // because the td definition of insert_vector does not support inserting a
+ // fixed length vector into a scalable vector.
+
+ ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
+ return;
+ }
+
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
@@ -3185,10 +3501,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3212,10 +3528,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3239,10 +3555,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3266,10 +3582,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3293,10 +3609,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3320,10 +3636,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3347,10 +3663,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3374,10 +3690,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3401,10 +3717,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3426,7 +3742,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 2, AArch64::LD2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 2, AArch64::LD2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3444,7 +3760,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 3, AArch64::LD3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 3, AArch64::LD3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3462,7 +3778,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 4, AArch64::LD4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 4, AArch64::LD4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3537,10 +3853,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST1Twov16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST1Twov4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST1Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3565,10 +3883,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST1Threev16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST1Threev4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST1Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3593,10 +3913,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST1Fourv16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3621,10 +3943,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST2Twov16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST2Twov4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST2Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3649,10 +3973,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST3Threev16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST3Threev4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST3Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3677,10 +4003,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST4Fourv16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3703,7 +4031,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 2, AArch64::ST2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 2, AArch64::ST2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3722,7 +4050,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 3, AArch64::ST3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 3, AArch64::ST3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3741,7 +4069,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 4, AArch64::ST4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 4, AArch64::ST4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3755,6 +4083,69 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::aarch64_sve_st2: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B,
+ AArch64::ST2B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H,
+ AArch64::ST2H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W,
+ AArch64::ST2W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D,
+ AArch64::ST2D_IMM);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_st3: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B,
+ AArch64::ST3B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H,
+ AArch64::ST3H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W,
+ AArch64::ST3W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D,
+ AArch64::ST3D_IMM);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_st4: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B,
+ AArch64::ST4B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H,
+ AArch64::ST4H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W,
+ AArch64::ST4W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D,
+ AArch64::ST4D_IMM);
+ return;
+ }
+ break;
+ }
}
break;
}
@@ -3765,10 +4156,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3793,10 +4184,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3821,10 +4212,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3849,10 +4240,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3877,10 +4268,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3905,10 +4296,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3933,10 +4324,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3961,10 +4352,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3989,10 +4380,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4017,10 +4408,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4043,7 +4434,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4062,7 +4453,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4081,7 +4472,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4100,7 +4491,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4122,10 +4513,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4151,10 +4542,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4180,10 +4571,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4209,10 +4600,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4238,10 +4629,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4267,10 +4658,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4294,7 +4685,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4314,7 +4705,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4334,7 +4725,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4348,6 +4739,57 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case AArch64ISD::SVE_LD2_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD3_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD4_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+ return;
+ }
+ break;
+ }
}
// Select the default instruction
@@ -4360,3 +4802,130 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
}
+
+/// When \p PredVT is a scalable vector predicate in the form
+/// MVT::nx<M>xi1, it builds the corresponding scalable vector of
+/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input
+/// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
+/// EVT.
+static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) {
+ if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
+ return EVT();
+
+ if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
+ PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
+ return EVT();
+
+ ElementCount EC = PredVT.getVectorElementCount();
+ EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
+ EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC);
+ return MemVT;
+}
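A worked instance may help here (illustrative only, assuming AArch64::SVEBitsPerBlock == 128 as used above): nxv4i1 has M == 4, so the element width is 128 / 4 == 32 and the helper returns nxv4i32. A minimal standalone sketch of that arithmetic:

    #include <cstdint>

    // Sketch of getPackedVectorTypeFromPredicateType's size computation,
    // under the assumption that SVEBitsPerBlock == 128.
    constexpr uint64_t SVEBitsPerBlock = 128;

    constexpr uint64_t packedElementBits(uint64_t M) {
      return SVEBitsPerBlock / M; // M lanes of i1 -> M lanes of i<128/M>
    }

    static_assert(packedElementBits(16) == 8, "nxv16i1 -> nxv16i8");
    static_assert(packedElementBits(8) == 16, "nxv8i1  -> nxv8i16");
    static_assert(packedElementBits(4) == 32, "nxv4i1  -> nxv4i32");
    static_assert(packedElementBits(2) == 64, "nxv2i1  -> nxv2i64");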
+
+/// Return the EVT of the data associated to a memory operation in \p
+/// Root. If such EVT cannot be retrieved, it returns an invalid EVT.
+static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
+ if (isa<MemSDNode>(Root))
+ return cast<MemSDNode>(Root)->getMemoryVT();
+
+ if (isa<MemIntrinsicSDNode>(Root))
+ return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
+
+ const unsigned Opcode = Root->getOpcode();
+ // For custom ISD nodes, we have to look at them individually to extract the
+ // type of the data moved to/from memory.
+ switch (Opcode) {
+ case AArch64ISD::LD1_MERGE_ZERO:
+ case AArch64ISD::LD1S_MERGE_ZERO:
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ case AArch64ISD::LDNF1S_MERGE_ZERO:
+ return cast<VTSDNode>(Root->getOperand(3))->getVT();
+ case AArch64ISD::ST1_PRED:
+ return cast<VTSDNode>(Root->getOperand(4))->getVT();
+ default:
+ break;
+ }
+
+ if (Opcode != ISD::INTRINSIC_VOID)
+ return EVT();
+
+ const unsigned IntNo =
+ cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
+ if (IntNo != Intrinsic::aarch64_sve_prf)
+ return EVT();
+
+ // We are using an SVE prefetch intrinsic. The data type must be
+ // inferred from the width of the predicate.
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(2)->getValueType(0));
+}
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max,
+/// where Root is the memory access using N for its address.
+template <int64_t Min, int64_t Max>
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
+ SDValue &Base,
+ SDValue &OffImm) {
+ const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
+
+ if (MemVT == EVT())
+ return false;
+
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ SDValue VScale = N.getOperand(1);
+ if (VScale.getOpcode() != ISD::VSCALE)
+ return false;
+
+ TypeSize TS = MemVT.getSizeInBits();
+ int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
+ int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
+
+ if ((MulImm % MemWidthBytes) != 0)
+ return false;
+
+ int64_t Offset = MulImm / MemWidthBytes;
+ if (Offset < Min || Offset > Max)
+ return false;
+
+ Base = N.getOperand(0);
+ OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
+ return true;
+}
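To make the folding concrete (numbers are illustrative, not from the patch): an LD1W of nxv4i32 has a known-minimum memory size of 128 bits, so MemWidthBytes == 16, and an address of the form base + vscale * 48 folds to Offset == 3 whenever 3 lies within [Min, Max] for the instruction's immediate field. A standalone model of the check:

    #include <cstdint>
    #include <optional>

    // Sketch of the legality test above; Min/Max stand in for the signed
    // immediate range of the instruction (an assumption for illustration).
    std::optional<int64_t> foldSVEOffset(int64_t MulImm, int64_t MemWidthBytes,
                                         int64_t Min, int64_t Max) {
      if (MulImm % MemWidthBytes != 0) // offset must be a whole vector count
        return std::nullopt;
      int64_t Offset = MulImm / MemWidthBytes;
      if (Offset < Min || Offset > Max) // must fit the immediate field
        return std::nullopt;
      return Offset;
    }

    // foldSVEOffset(48, 16, -8, 7) yields 3; foldSVEOffset(40, 16, -8, 7)
    // yields nullopt because 40 is not a multiple of 16.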
+
+/// Select register plus register addressing mode for SVE, with scaled
+/// offset.
+bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
+ SDValue &Base,
+ SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Process an ADD node.
+ const SDValue LHS = N.getOperand(0);
+ const SDValue RHS = N.getOperand(1);
+
+ // 8-bit data needs no scaling, so its offset arrives without a SHL
+ // node and is treated separately.
+ if (Scale == 0) {
+ Base = LHS;
+ Offset = RHS;
+ return true;
+ }
+
+ // Check if the RHS is a shift node with a constant.
+ if (RHS.getOpcode() != ISD::SHL)
+ return false;
+
+ const SDValue ShiftRHS = RHS.getOperand(1);
+ if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
+ if (C->getZExtValue() == Scale) {
+ Base = LHS;
+ Offset = RHS.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
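As a sketch of the shapes this matcher accepts (an assumed, simplified node model, illustration only): with Scale == 0 any (add base, index) matches, while for Scale > 0 the index must arrive as (shl index, Scale), e.g. (add base, (shl index, 2)) for 32-bit elements.

    // Toy expression-tree model of SelectSVERegRegAddrMode (assumed types).
    struct Node {
      enum Kind { Add, Shl, Leaf } K;
      const Node *L = nullptr, *R = nullptr;
      unsigned Imm = 0; // shift amount for the Leaf under a Shl
    };

    bool matchRegReg(const Node &N, unsigned Scale,
                     const Node *&Base, const Node *&Offset) {
      if (N.K != Node::Add)
        return false;
      if (Scale == 0) { // byte elements carry no shift
        Base = N.L;
        Offset = N.R;
        return true;
      }
      // Otherwise require (shl index, Scale) on the right-hand side.
      if (N.R && N.R->K == Node::Shl && N.R->R && N.R->R->K == Node::Leaf &&
          N.R->R->Imm == Scale) {
        Base = N.L;
        Offset = N.R->L;
        return true;
      }
      return false;
    }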
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d45a80057564a..85db14ab66feb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
-
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
@@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Returns true if VT's elements occupy the lowest bit positions of its
+/// associated register class without any intervening space.
+///
+/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
+/// same register class, but only nxv8f16 can be treated as a packed vector.
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
+ assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal vector type!");
+ return VT.isFixedLengthVector() ||
+ VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
+}
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
@@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
+ addDRTypeForNEON(MVT::v4bf16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
+ addQRTypeForNEON(MVT::v8bf16);
}
if (Subtarget->hasSVE()) {
@@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+ if (Subtarget->hasBF16()) {
+ addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
+ }
+
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+ }
+
for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
}
for (auto VT :
{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
+
+ for (auto VT :
+ { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
+ MVT::nxv2f64 }) {
+ setCondCodeAction(ISD::SETO, VT, Expand);
+ setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETUGT, VT, Expand);
+ setCondCodeAction(ISD::SETUEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
}
// Compute derived properties from the register classes
@@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
@@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
+ // AArch64 doesn't have i32 MULH{S|U}.
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
@@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
+ // custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
+ setIndexedLoadAction(im, MVT::bf16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
@@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
+ setIndexedStoreAction(im, MVT::bf16, Legal);
}
// Trap.
@@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ if (Subtarget->hasSVE())
+ setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
@@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// splat of 0 or undef) once vector selects are supported in SVE codegen. See
// D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT))
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ if (VT.getScalarType() == MVT::i1)
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
}
+
+ for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+
+ for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
+
+ // NOTE: Currently this has to happen after computeRegisterProperties rather
+ // than the preferred option of combining it with the addRegisterClass call.
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+
+ // 64-bit results can come from an input wider than a NEON register.
+ for (auto VT : {MVT::v8i8, MVT::v4i16})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+
+ // 128-bit results imply an input wider than a NEON register.
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ for (auto VT : {MVT::v8f16, MVT::v4f32})
+ setOperationAction(ISD::FP_ROUND, VT, Expand);
+ }
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
}
}
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ // By default everything must be expanded.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+ setOperationAction(Op, VT, Expand);
+
+ // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // Lower fixed length vector operations to scalable equivalents.
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+}
+
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
@@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v4i32);
}
-EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
- EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
+ LLVMContext &C, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
+ if (VT.isScalableVector())
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
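The practical difference (illustrative cases, not from the patch): comparing two nxv4f32 values now yields an nxv4i1 predicate, while scalars and fixed-length vectors behave as before.

    // Assumed result shapes of getSetCCResultType after this change:
    //   f32     -> i32     (scalar, unchanged)
    //   v4f32   -> v4i32   (fixed vector: element type made integer)
    //   nxv4f32 -> nxv4i1  (scalable vector: one predicate bit per lane)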
@@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
- SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
@@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
- if (Demanded.countPopulation() == Size)
+ if (DemandedBits.countPopulation() == Size)
return false;
unsigned NewOpc;
@@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
- return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+ return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
- LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
@@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
- Align <= 2 ||
+ Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
switch ((AArch64ISD::NodeType)Opcode) {
- case AArch64ISD::FIRST_NUMBER: break;
- case AArch64ISD::CALL: return "AArch64ISD::CALL";
- case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
- case AArch64ISD::ADR: return "AArch64ISD::ADR";
- case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
- case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
- case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
- case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
- case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
- case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
- case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
- case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
- case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
- case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
- case AArch64ISD::ADC: return "AArch64ISD::ADC";
- case AArch64ISD::SBC: return "AArch64ISD::SBC";
- case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
- case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
- case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
- case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
- case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
- case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
- case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
- case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
- case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
- case AArch64ISD::DUP: return "AArch64ISD::DUP";
- case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
- case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
- case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
- case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
- case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
- case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
- case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
- case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
- case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
- case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
- case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
- case AArch64ISD::BICi: return "AArch64ISD::BICi";
- case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
- case AArch64ISD::BSL: return "AArch64ISD::BSL";
- case AArch64ISD::NEG: return "AArch64ISD::NEG";
- case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
- case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
- case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
- case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
- case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
- case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
- case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
- case AArch64ISD::REV16: return "AArch64ISD::REV16";
- case AArch64ISD::REV32: return "AArch64ISD::REV32";
- case AArch64ISD::REV64: return "AArch64ISD::REV64";
- case AArch64ISD::EXT: return "AArch64ISD::EXT";
- case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
- case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
- case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
- case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
- case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
- case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
- case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
- case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
- case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
- case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
- case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
- case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
- case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
- case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
- case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
- case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
- case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
- case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
- case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
- case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
- case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
- case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
- case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
- case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
- case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
- case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
- case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
- case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED";
- case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED";
- case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED";
- case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED";
- case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED";
- case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED";
- case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";
- case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";
- case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";
- case AArch64ISD::LASTA: return "AArch64ISD::LASTA";
- case AArch64ISD::LASTB: return "AArch64ISD::LASTB";
- case AArch64ISD::REV: return "AArch64ISD::REV";
- case AArch64ISD::TBL: return "AArch64ISD::TBL";
- case AArch64ISD::NOT: return "AArch64ISD::NOT";
- case AArch64ISD::BIT: return "AArch64ISD::BIT";
- case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
- case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
- case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
- case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
- case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
- case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
- case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
- case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
- case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
- case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
- case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
- case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
- case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
- case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
- case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
- case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
- case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
- case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
- case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
- case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
- case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
- case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
- case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
- case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
- case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
- case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
- case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
- case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
- case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
- case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
- case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
- case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
- case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
- case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
- case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
- case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
- case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
- case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
- case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
- case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
- case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
- case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
- case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
- case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
- case AArch64ISD::STG: return "AArch64ISD::STG";
- case AArch64ISD::STZG: return "AArch64ISD::STZG";
- case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
- case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
- case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
- case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
- case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
- case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
- case AArch64ISD::INSR: return "AArch64ISD::INSR";
- case AArch64ISD::PTEST: return "AArch64ISD::PTEST";
- case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
- case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
- case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
- case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
- case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW";
- case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED";
- case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED";
- case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM";
- case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S";
- case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED";
- case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW";
- case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW";
- case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
- case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
- case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
- case AArch64ISD::SST1: return "AArch64ISD::SST1";
- case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
- case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
- case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
- case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
- case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
- case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
- case AArch64ISD::LDP: return "AArch64ISD::LDP";
- case AArch64ISD::STP: return "AArch64ISD::STP";
- }
+ case AArch64ISD::FIRST_NUMBER:
+ break;
+ MAKE_CASE(AArch64ISD::CALL)
+ MAKE_CASE(AArch64ISD::ADRP)
+ MAKE_CASE(AArch64ISD::ADR)
+ MAKE_CASE(AArch64ISD::ADDlow)
+ MAKE_CASE(AArch64ISD::LOADgot)
+ MAKE_CASE(AArch64ISD::RET_FLAG)
+ MAKE_CASE(AArch64ISD::BRCOND)
+ MAKE_CASE(AArch64ISD::CSEL)
+ MAKE_CASE(AArch64ISD::FCSEL)
+ MAKE_CASE(AArch64ISD::CSINV)
+ MAKE_CASE(AArch64ISD::CSNEG)
+ MAKE_CASE(AArch64ISD::CSINC)
+ MAKE_CASE(AArch64ISD::THREAD_POINTER)
+ MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+ MAKE_CASE(AArch64ISD::ADD_PRED)
+ MAKE_CASE(AArch64ISD::SDIV_PRED)
+ MAKE_CASE(AArch64ISD::UDIV_PRED)
+ MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ADC)
+ MAKE_CASE(AArch64ISD::SBC)
+ MAKE_CASE(AArch64ISD::ADDS)
+ MAKE_CASE(AArch64ISD::SUBS)
+ MAKE_CASE(AArch64ISD::ADCS)
+ MAKE_CASE(AArch64ISD::SBCS)
+ MAKE_CASE(AArch64ISD::ANDS)
+ MAKE_CASE(AArch64ISD::CCMP)
+ MAKE_CASE(AArch64ISD::CCMN)
+ MAKE_CASE(AArch64ISD::FCCMP)
+ MAKE_CASE(AArch64ISD::FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMPE)
+ MAKE_CASE(AArch64ISD::DUP)
+ MAKE_CASE(AArch64ISD::DUPLANE8)
+ MAKE_CASE(AArch64ISD::DUPLANE16)
+ MAKE_CASE(AArch64ISD::DUPLANE32)
+ MAKE_CASE(AArch64ISD::DUPLANE64)
+ MAKE_CASE(AArch64ISD::MOVI)
+ MAKE_CASE(AArch64ISD::MOVIshift)
+ MAKE_CASE(AArch64ISD::MOVIedit)
+ MAKE_CASE(AArch64ISD::MOVImsl)
+ MAKE_CASE(AArch64ISD::FMOV)
+ MAKE_CASE(AArch64ISD::MVNIshift)
+ MAKE_CASE(AArch64ISD::MVNImsl)
+ MAKE_CASE(AArch64ISD::BICi)
+ MAKE_CASE(AArch64ISD::ORRi)
+ MAKE_CASE(AArch64ISD::BSP)
+ MAKE_CASE(AArch64ISD::NEG)
+ MAKE_CASE(AArch64ISD::EXTR)
+ MAKE_CASE(AArch64ISD::ZIP1)
+ MAKE_CASE(AArch64ISD::ZIP2)
+ MAKE_CASE(AArch64ISD::UZP1)
+ MAKE_CASE(AArch64ISD::UZP2)
+ MAKE_CASE(AArch64ISD::TRN1)
+ MAKE_CASE(AArch64ISD::TRN2)
+ MAKE_CASE(AArch64ISD::REV16)
+ MAKE_CASE(AArch64ISD::REV32)
+ MAKE_CASE(AArch64ISD::REV64)
+ MAKE_CASE(AArch64ISD::EXT)
+ MAKE_CASE(AArch64ISD::VSHL)
+ MAKE_CASE(AArch64ISD::VLSHR)
+ MAKE_CASE(AArch64ISD::VASHR)
+ MAKE_CASE(AArch64ISD::VSLI)
+ MAKE_CASE(AArch64ISD::VSRI)
+ MAKE_CASE(AArch64ISD::CMEQ)
+ MAKE_CASE(AArch64ISD::CMGE)
+ MAKE_CASE(AArch64ISD::CMGT)
+ MAKE_CASE(AArch64ISD::CMHI)
+ MAKE_CASE(AArch64ISD::CMHS)
+ MAKE_CASE(AArch64ISD::FCMEQ)
+ MAKE_CASE(AArch64ISD::FCMGE)
+ MAKE_CASE(AArch64ISD::FCMGT)
+ MAKE_CASE(AArch64ISD::CMEQz)
+ MAKE_CASE(AArch64ISD::CMGEz)
+ MAKE_CASE(AArch64ISD::CMGTz)
+ MAKE_CASE(AArch64ISD::CMLEz)
+ MAKE_CASE(AArch64ISD::CMLTz)
+ MAKE_CASE(AArch64ISD::FCMEQz)
+ MAKE_CASE(AArch64ISD::FCMGEz)
+ MAKE_CASE(AArch64ISD::FCMGTz)
+ MAKE_CASE(AArch64ISD::FCMLEz)
+ MAKE_CASE(AArch64ISD::FCMLTz)
+ MAKE_CASE(AArch64ISD::SADDV)
+ MAKE_CASE(AArch64ISD::UADDV)
+ MAKE_CASE(AArch64ISD::SRHADD)
+ MAKE_CASE(AArch64ISD::URHADD)
+ MAKE_CASE(AArch64ISD::SMINV)
+ MAKE_CASE(AArch64ISD::UMINV)
+ MAKE_CASE(AArch64ISD::SMAXV)
+ MAKE_CASE(AArch64ISD::UMAXV)
+ MAKE_CASE(AArch64ISD::SMAXV_PRED)
+ MAKE_CASE(AArch64ISD::UMAXV_PRED)
+ MAKE_CASE(AArch64ISD::SMINV_PRED)
+ MAKE_CASE(AArch64ISD::UMINV_PRED)
+ MAKE_CASE(AArch64ISD::ORV_PRED)
+ MAKE_CASE(AArch64ISD::EORV_PRED)
+ MAKE_CASE(AArch64ISD::ANDV_PRED)
+ MAKE_CASE(AArch64ISD::CLASTA_N)
+ MAKE_CASE(AArch64ISD::CLASTB_N)
+ MAKE_CASE(AArch64ISD::LASTA)
+ MAKE_CASE(AArch64ISD::LASTB)
+ MAKE_CASE(AArch64ISD::REV)
+ MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+ MAKE_CASE(AArch64ISD::TBL)
+ MAKE_CASE(AArch64ISD::FADD_PRED)
+ MAKE_CASE(AArch64ISD::FADDA_PRED)
+ MAKE_CASE(AArch64ISD::FADDV_PRED)
+ MAKE_CASE(AArch64ISD::FMA_PRED)
+ MAKE_CASE(AArch64ISD::FMAXV_PRED)
+ MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
+ MAKE_CASE(AArch64ISD::FMINV_PRED)
+ MAKE_CASE(AArch64ISD::FMINNMV_PRED)
+ MAKE_CASE(AArch64ISD::NOT)
+ MAKE_CASE(AArch64ISD::BIT)
+ MAKE_CASE(AArch64ISD::CBZ)
+ MAKE_CASE(AArch64ISD::CBNZ)
+ MAKE_CASE(AArch64ISD::TBZ)
+ MAKE_CASE(AArch64ISD::TBNZ)
+ MAKE_CASE(AArch64ISD::TC_RETURN)
+ MAKE_CASE(AArch64ISD::PREFETCH)
+ MAKE_CASE(AArch64ISD::SITOF)
+ MAKE_CASE(AArch64ISD::UITOF)
+ MAKE_CASE(AArch64ISD::NVCAST)
+ MAKE_CASE(AArch64ISD::SQSHL_I)
+ MAKE_CASE(AArch64ISD::UQSHL_I)
+ MAKE_CASE(AArch64ISD::SRSHR_I)
+ MAKE_CASE(AArch64ISD::URSHR_I)
+ MAKE_CASE(AArch64ISD::SQSHLU_I)
+ MAKE_CASE(AArch64ISD::WrapperLarge)
+ MAKE_CASE(AArch64ISD::LD2post)
+ MAKE_CASE(AArch64ISD::LD3post)
+ MAKE_CASE(AArch64ISD::LD4post)
+ MAKE_CASE(AArch64ISD::ST2post)
+ MAKE_CASE(AArch64ISD::ST3post)
+ MAKE_CASE(AArch64ISD::ST4post)
+ MAKE_CASE(AArch64ISD::LD1x2post)
+ MAKE_CASE(AArch64ISD::LD1x3post)
+ MAKE_CASE(AArch64ISD::LD1x4post)
+ MAKE_CASE(AArch64ISD::ST1x2post)
+ MAKE_CASE(AArch64ISD::ST1x3post)
+ MAKE_CASE(AArch64ISD::ST1x4post)
+ MAKE_CASE(AArch64ISD::LD1DUPpost)
+ MAKE_CASE(AArch64ISD::LD2DUPpost)
+ MAKE_CASE(AArch64ISD::LD3DUPpost)
+ MAKE_CASE(AArch64ISD::LD4DUPpost)
+ MAKE_CASE(AArch64ISD::LD1LANEpost)
+ MAKE_CASE(AArch64ISD::LD2LANEpost)
+ MAKE_CASE(AArch64ISD::LD3LANEpost)
+ MAKE_CASE(AArch64ISD::LD4LANEpost)
+ MAKE_CASE(AArch64ISD::ST2LANEpost)
+ MAKE_CASE(AArch64ISD::ST3LANEpost)
+ MAKE_CASE(AArch64ISD::ST4LANEpost)
+ MAKE_CASE(AArch64ISD::SMULL)
+ MAKE_CASE(AArch64ISD::UMULL)
+ MAKE_CASE(AArch64ISD::FRECPE)
+ MAKE_CASE(AArch64ISD::FRECPS)
+ MAKE_CASE(AArch64ISD::FRSQRTE)
+ MAKE_CASE(AArch64ISD::FRSQRTS)
+ MAKE_CASE(AArch64ISD::STG)
+ MAKE_CASE(AArch64ISD::STZG)
+ MAKE_CASE(AArch64ISD::ST2G)
+ MAKE_CASE(AArch64ISD::STZ2G)
+ MAKE_CASE(AArch64ISD::SUNPKHI)
+ MAKE_CASE(AArch64ISD::SUNPKLO)
+ MAKE_CASE(AArch64ISD::UUNPKHI)
+ MAKE_CASE(AArch64ISD::UUNPKLO)
+ MAKE_CASE(AArch64ISD::INSR)
+ MAKE_CASE(AArch64ISD::PTEST)
+ MAKE_CASE(AArch64ISD::PTRUE)
+ MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
+ MAKE_CASE(AArch64ISD::LDP)
+ MAKE_CASE(AArch64ISD::STP)
+ MAKE_CASE(AArch64ISD::STNP)
+ MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ }
+#undef MAKE_CASE
return nullptr;
}
@@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
-MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
- MachineInstr &MI, MachineBasicBlock *BB) const {
- MI.eraseFromParent();
- return BB;
-}
-
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case AArch64::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
}
}
@@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) {
(CC == ISD::SETEQ || CC == ISD::SETNE);
}
+static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
+ SelectionDAG &DAG, SDValue Chain,
+ bool IsSignaling) {
+ EVT VT = LHS.getValueType();
+ assert(VT != MVT::f128);
+ assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
+ unsigned Opcode =
+ IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
+ return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
+}
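The two opcodes mirror the quiet and signaling hardware compares (FCMP vs. FCMPE). A hypothetical call site, with LHS/RHS/dl/Chain assumed in scope, would look like:

    // Lowering a STRICT_FSETCCS (signaling) comparison: result 0 feeds the
    // flag-consuming node (e.g. CSEL); result 1 is the updated chain that
    // preserves FP-exception ordering.
    SDValue Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain,
                                         /*IsSignaling=*/true);
    Chain = Cmp.getValue(1);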
+
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
@@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
- } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
- !isUnsignedIntSetCC(CC)) {
- // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
- // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
- // of the signed comparisons.
- Opcode = AArch64ISD::ANDS;
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
+ } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+ if (LHS.getOpcode() == ISD::AND) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+ DAG.getVTList(VT, MVT_CC),
+ LHS.getOperand(0),
+ LHS.getOperand(1));
+ // Replace all users of (and X, Y) with newly generated (ands X, Y)
+ DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+ return ANDSNode.getValue(1);
+ } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+ // Use result of ANDS
+ return LHS.getValue(1);
+ }
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
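The net effect of the rewritten branch is worth spelling out (assumed codegen, simplified): when the AND result is also used as data, a single ANDS now produces both the value and the flags, and an ANDS that already exists is reused outright rather than paired with a second compare.

    // Before: and  w8, w0, w1      // value for other users
    //         tst  w0, w1          // duplicate ANDS just for the flags
    // After:  ands w8, w0, w1      // one instruction: all users of the AND
    //                              // are rewired to it, and its flag result
    //                              // (value 1) feeds the comparison.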
@@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
-}
-
-// Returns true if the given Op is the overflow flag result of an overflow
-// intrinsic operation.
-static bool isOverflowIntrOpRes(SDValue Op) {
- unsigned Opc = Op.getOpcode();
- return (Op.getResNo() == 1 &&
- (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
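The operand layout being handled here follows the strict-FP node convention, stated as an assumption for readers unfamiliar with it:

    // Non-strict: (fadd x, y)            -> result
    // Strict:     (strict_fadd ch, x, y) -> { result, outchain }
    // Hence Offset == 1 skips the chain when collecting value operands,
    // and the merged {Result, Chain} pair mirrors the node's two results.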
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
- if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
+ if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
@@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = SrcVal.getValueType();
+
+ if (SrcVT != MVT::f128) {
+ // Expand cases where the input is a vector bigger than NEON.
+ if (useSVEForFixedLengthVectorVT(SrcVT))
+ return SDValue();
+
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
- SDValue SrcVal = Op.getOperand(0);
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
- SDLoc(Op)).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
@@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType().isVector())
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
+ if (SrcVal.getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
- if (Op.getOperand(0).getValueType() == MVT::f16 &&
- !Subtarget->hasFullFP16()) {
+ if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
}
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ if (SrcVal.getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
+ return LowerF128Call(Op, DAG, LC);
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
- DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
- if (Op.getOperand(0).getValueType() == MVT::i128)
+ if (SrcVal.getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
@@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
return Op;
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::SINT_TO_FP)
- LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
@@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() != MVT::f16)
+ EVT OpVT = Op.getValueType();
+ if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
@@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
- DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
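Tracing a concrete value through this lowering (illustrative): bitcasting i16 0x3C00 to f16 produces f16 1.0 without any conversion instruction.

    // i16 0x3C00                  // input bits (f16 1.0)
    //   -> any_extend to i32      // upper 16 bits are don't-care
    //   -> bitcast to f32         // moves the GPR bits into an FP register
    //   -> EXTRACT_SUBREG hsub    // reads the low 16 bits as f16 (or bf16)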
@@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
- DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
- MVT::i64));
+ SDValue Chain = Op.getOperand(0);
+ SDValue FPCR_64 = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
+ {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
+ Chain = FPCR_64.getValue(1);
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
- return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
- DAG.getConstant(3, dl, MVT::i32));
+ SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+ return DAG.getMergeValues({AND, Chain}, dl);
}
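The shift-and-add trick deserves a worked example. FPCR bits [23:22] encode the rounding mode as {0: nearest, 1: +inf, 2: -inf, 3: zero}, while FLT_ROUNDS expects {1, 2, 3, 0} for the same modes; adding 1 << 22 before extracting two bits performs exactly that rotation. A standalone model (the encodings are architectural, the snippet itself is illustrative):

    #include <cstdint>

    // Models the lowering above: ((FPCR + (1 << 22)) >> 22) & 3.
    constexpr uint32_t fltRoundsFromFPCR(uint32_t FPCR32) {
      return ((FPCR32 + (1u << 22)) >> 22) & 3u;
    }

    static_assert(fltRoundsFromFPCR(0u << 22) == 1, "RN -> 1 (nearest)");
    static_assert(fltRoundsFromFPCR(1u << 22) == 2, "RP -> 2 (+inf)");
    static_assert(fltRoundsFromFPCR(2u << 22) == 3, "RM -> 3 (-inf)");
    static_assert(fltRoundsFromFPCR(3u << 22) == 0, "RZ -> 0 (zero)");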
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
@@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
+ int Pattern) {
+ return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
+ DAG.getTargetConstant(Pattern, DL, MVT::i32));
+}
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_dupq_lane:
+ return LowerDUPQLane(Op, DAG);
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_convert_to_svbool: {
+ EVT OutVT = Op.getValueType();
+ EVT InVT = Op.getOperand(1).getValueType();
+ // Return the operand if the cast isn't changing type,
+ // i.e. <n x 16 x i1> -> <n x 16 x i1>
+ if (InVT == OutVT)
+ return Op.getOperand(1);
+ // Otherwise, zero the newly introduced lanes.
+ SDValue Reinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
+ SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
+ SDValue MaskReinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
+ return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
+ }
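A concrete widening (illustrative lane view): converting nxv4i1 to nxv16i1 exposes three previously insignificant bits between each predicate bit, and the AND with a reinterpreted all-true nxv4i1 forces them to zero.

    // One 128-bit block, nxv4i1 -> nxv16i1 (p = source bit, x = stale bit):
    //   reinterpret(Op):    p x x x | p x x x | p x x x | p x x x
    //   reinterpret(ptrue): 1 0 0 0 | 1 0 0 0 | 1 0 0 0 | 1 0 0 0
    //   AND:                p 0 0 0 | p 0 0 0 | p 0 0 0 | p 0 0 0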
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
@@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
+
+ case Intrinsic::aarch64_neon_vsri:
+ case Intrinsic::aarch64_neon_vsli: {
+ EVT Ty = Op.getValueType();
+
+ if (!Ty.isVector())
+ report_fatal_error("Unexpected type for aarch64_neon_vsli");
+
+ assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
+
+ bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+ unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3));
+ }
+
+ case Intrinsic::aarch64_neon_srhadd:
+ case Intrinsic::aarch64_neon_urhadd: {
+ bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
+ unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
}
}
@@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthVectorStoreToSVE(Op, DAG);
+
unsigned AS = StoreNode->getAddressSpace();
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+ Align Alignment = StoreNode->getAlign();
+ if (Alignment < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
@@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+ // the custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+ MemVT.getVectorElementCount().Min % 2u == 0 &&
+ ((MemVT.getScalarSizeInBits() == 8u ||
+ MemVT.getScalarSizeInBits() == 16u ||
+ MemVT.getScalarSizeInBits() == 32u ||
+ MemVT.getScalarSizeInBits() == 64u))) {
+ SDValue Lo =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(),
+ DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+ }
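For example (an assumed instance of the case guarded above): a non-temporal store of v8i32 splits into Lo = lanes 0-3 and Hi = lanes 4-7, each v4i32, which the STNP node stores as one pair.

    // MemVT == v8i32 (illustrative):
    //   Lo = extract_subvector %val, 0   // lanes 0..3 -> v4i32
    //   Hi = extract_subvector %val, 4   // lanes 4..7 -> v4i32
    //   stnp qLo, qHi, [base]            // single paired non-temporal store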
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
@@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
@@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FMA:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
@@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return LowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::SDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
+ case ISD::UDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+ case ISD::SMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
+ case ISD::UMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
+ case ISD::SMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
+ case ISD::UMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
@@ -3218,9 +3571,68 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::VSCALE:
+ return LowerVSCALE(Op, DAG);
+ case ISD::TRUNCATE:
+ return LowerTRUNCATE(Op, DAG);
+ case ISD::LOAD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerFixedLengthVectorLoadToSVE(Op, DAG);
+ llvm_unreachable("Unexpected request to lower ISD::LOAD");
+ case ISD::ADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
+ llvm_unreachable("Unexpected request to lower ISD::ADD");
}
}
+bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+}
+
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
+ if (!useSVEForFixedLengthVectors())
+ return false;
+
+ if (!VT.isFixedLengthVector())
+ return false;
+
+ // Fixed length predicates should be promoted to i8.
+  // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) works.
+ if (VT.getVectorElementType() == MVT::i1)
+ return false;
+
+ // Don't use SVE for vectors we cannot scalarize if required.
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ break;
+ }
+
+ // Ensure NEON MVTs only belong to a single register class.
+ if (VT.getSizeInBits() <= 128)
+ return false;
+
+ // Don't use SVE for types that don't fit.
+ if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
+ return false;
+
+ // TODO: Perhaps an artificial restriction, but worth having whilst getting
+ // the base fixed length SVE support in place.
+ if (!VT.isPow2VectorType())
+ return false;
+
+ return true;
+}
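A minimal sketch of the same eligibility checks in isolation, assuming sizes are given in bits and ignoring the integer/float and predicate-element distinctions; the name and parameters are illustrative, not LLVM API.

#include <cstdint>

bool shouldUseSVEForFixedVector(unsigned ElemBits, unsigned TotalBits,
                                unsigned MinSVEBits, bool Pow2NumElts) {
  if (MinSVEBits < 256)       // prefer NEON unless SVE registers are wider
    return false;
  if (ElemBits != 8 && ElemBits != 16 && ElemBits != 32 && ElemBits != 64)
    return false;             // must be an element type we can scalarize
  if (TotalBits <= 128)       // NEON-sized vectors keep their register class
    return false;
  if (TotalBits > MinSVEBits) // must fit the guaranteed SVE width
    return false;
  return Pow2NumElts;         // interim power-of-two restriction
}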
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
- case CallingConv::AArch64_SVE_VectorCall:
- // Calling SVE functions is currently not yet supported.
- report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
@@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::CFGuard_Check:
return CC_AArch64_Win64_CFGuard_Check;
case CallingConv::AArch64_VectorCall:
+ case CallingConv::AArch64_SVE_VectorCall:
return CC_AArch64_AAPCS;
}
}
@@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
- else if (RegVT == MVT::f16)
+ else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
@@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
@@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
- unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+ unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+ ? VA.getLocVT().getSizeInBits()
+ : VA.getValVT().getSizeInBits()) / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MemVT = VA.getLocVT();
+ break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
MemVT);
}
+
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ // If value is passed via pointer - do a load.
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+ }
+
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
@@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
- GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
- FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
+ FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
@@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+ // When using the Windows calling convention on a non-windows OS, we want
+ // to back up and restore X18 in such functions; we can't do a tail call
+ // from those functions.
+ if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
+ CalleeCC != CallingConv::Win64)
+ return false;
+
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
@@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ // If any of the arguments is passed indirectly, it must be SVE, so the
+ // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
+  // allocate space on the stack. That is why we determine here explicitly
+  // that the call cannot be a tail call.
+ if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+ assert((A.getLocInfo() != CCValAssign::Indirect ||
+ A.getValVT().isScalableVector()) &&
+ "Expected value to be scalable");
+ return A.getLocInfo() == CCValAssign::Indirect;
+ }))
+ return false;
+
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
+ if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
@@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+ Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
+ int FI = MFI.CreateStackObject(
+ VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+ MFI.setStackID(FI, TargetStackID::SVEVector);
+
+ SDValue SpillSlot = DAG.getFrameIndex(
+ FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ Chain = DAG.getStore(
+ Chain, DL, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ break;
}
if (VA.isRegLoc()) {
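Conceptually, the Indirect path above is plain pass-by-reference: spill the value to a fresh stack slot and hand the callee its address. A hedged standalone analogue (calleeIndirect is hypothetical):

#include <cstring>

void calleeIndirect(const void *ArgPtr); // hypothetical callee

template <typename T> void callWithIndirectArg(const T &Arg) {
  alignas(16) unsigned char SpillSlot[sizeof(T)]; // stack object
  std::memcpy(SpillSlot, &Arg, sizeof(T));        // store to the slot
  calleeIndirect(SpillSlot);                      // pass the pointer instead
}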
@@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
}
} else {
@@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
- unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
- : VA.getValVT().getSizeInBits();
+ unsigned OpSize;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ OpSize = VA.getLocVT().getSizeInBits();
+ else
+ OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
@@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
- Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ Chain, DL, DstAddr, Arg, SizeNode,
+ Outs[i].Flags.getNonZeroByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
- /*isTailCall = */ false,
- DstInfo, MachinePointerInfo());
+ /*isTailCall = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
}
@@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
- if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
+ if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
@@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Cmp);
}
- assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
- LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
+ LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
@@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
+ } else if (VT == MVT::i128) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
+
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
+ SDValue UaddLV = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
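A scalar sketch of the i128 case above: byte-wise CNT followed by UADDLV sums the same bits as adding the popcounts of the two 64-bit halves.

#include <cstdint>

unsigned popcount128(uint64_t Lo, uint64_t Hi) {
  // Equivalent to CTPOP on v16i8 followed by UADDLV across the vector.
  return __builtin_popcountll(Lo) + __builtin_popcountll(Hi);
}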
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
@@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Chain;
+ if (IsStrict)
+ Chain = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(OpNo + 0);
+ SDValue RHS = Op.getOperand(OpNo + 1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
@@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
+ IsSignaling);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
- return LHS;
+ return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
}
}
@@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
// Now we know we're dealing with FP values.
@@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ SDValue Cmp;
+ if (IsStrict)
+ Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
+ else
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
+ SDValue Res;
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
CC2);
@@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
@@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
+ return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
@@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ if (Ty.isScalableVector()) {
+ SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
+ MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
+ SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
+ return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
+ }
+
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
- if (isOverflowIntrOpRes(CCVal)) {
+ if (ISD::isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
@@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize,
- false, false, false, MachinePointerInfo(DestSV),
- MachinePointerInfo(SrcSV));
+ DAG.getConstant(VaListSize, DL, MVT::i32),
+ Align(PtrSize), false, false, false,
+ MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(3);
+ MaybeAlign Align(Op.getConstantOperandVal(3));
unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
@@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Chain = VAList.getValue(1);
VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
- if (Align > MinSlotSize) {
- assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+ if (Align && *Align > MinSlotSize) {
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
- DAG.getConstant(Align - 1, DL, PtrVT));
+ DAG.getConstant(Align->value() - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
- DAG.getConstant(-(int64_t)Align, DL, PtrVT));
+ DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
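The two nodes added to VAList above implement the standard align-up idiom, valid for any power-of-two alignment:

#include <cstdint>

uint64_t alignUp(uint64_t Ptr, uint64_t Align) {
  // Add Align-1, then clear the low bits; e.g. alignUp(17, 16) == 32.
  return (Ptr + Align - 1) & ~(Align - 1);
}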
@@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
- VT.getVectorElementType() == MVT::f16)
+ VT.getVectorElementType() == MVT::f16 ||
+ VT.getVectorElementType() == MVT::bf16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
- else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+ else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
@@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
- if (EltType == MVT::i16 || EltType == MVT::f16)
+ if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
@@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// Extend input splat value where needed to fit into a GPR (32b or 64b only)
// FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
- case MVT::i64:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
case MVT::i1: {
+ // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
+ // lowering code.
+ if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ if (ConstVal->isOne())
+ return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
+ // TODO: Add special case for constant false
+ }
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
- // TODO: Add special cases for splat of constant true/false.
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
DAG.getValueType(MVT::i1));
@@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
DAG.getConstant(0, dl, MVT::i64), SplatVal);
}
- // TODO: we can support float types, but haven't added patterns yet.
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+ break;
+ case MVT::i64:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ break;
case MVT::f16:
+ case MVT::bf16:
case MVT::f32:
case MVT::f64:
+ // Fine as is
+ break;
default:
report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
}
+
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+}
+
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ EVT VT = Op.getValueType();
+ if (!isTypeLegal(VT) || !VT.isScalableVector())
+ return SDValue();
+
+ // Current lowering only supports the SVE-ACLE types.
+ if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+  // The DUPQ operation is independent of the element type, so normalise to i64s.
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+ SDValue Idx128 = Op.getOperand(2);
+
+ // DUPQ can be used when idx is in range.
+ auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+ if (CIdx && (CIdx->getZExtValue() <= 3)) {
+ SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+ SDNode *DUPQ =
+ DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+ }
+
+ // The ACLE says this must produce the same result as:
+ // svtbl(data, svadd_x(svptrue_b64(),
+ // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+ // index * 2))
+ SDValue One = DAG.getConstant(1, DL, MVT::i64);
+ SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+ // create the vector 0,1,0,1,...
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+ DL, MVT::nxv2i64, Zero, One);
+ SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+ // create the vector idx64,idx64+1,idx64,idx64+1,...
+ SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+ SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+ SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+ // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
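For the out-of-range index fallback above, the TBL mask works out to (i & 1) + 2*Idx128 per 64-bit lane; a fixed-width sketch (names illustrative):

#include <cstdint>
#include <vector>

std::vector<uint64_t> dupqTblMask(unsigned NumI64Lanes, uint64_t Idx128) {
  std::vector<uint64_t> Mask(NumI64Lanes);
  for (unsigned I = 0; I < NumI64Lanes; ++I)
    Mask[I] = (I & 1) + 2 * Idx128; // (index(0,1) & splat(1)) + splat(idx*2)
  return Mask; // selects Val[idx*2], Val[idx*2+1], repeating
}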
+
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) {
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTOR with constant element C1, C2 is a constant, and:
+// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
+// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
+// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // Is the first op an AND?
- const SDValue And = N->getOperand(0);
- if (And.getOpcode() != ISD::AND)
+ SDValue And;
+ SDValue Shift;
+
+ SDValue FirstOp = N->getOperand(0);
+ unsigned FirstOpc = FirstOp.getOpcode();
+ SDValue SecondOp = N->getOperand(1);
+ unsigned SecondOpc = SecondOp.getOpcode();
+
+ // Is one of the operands an AND or a BICi? The AND may have been optimised to
+ // a BICi in order to use an immediate instead of a register.
+  // Is the other operand a shl or lshr? This will have been turned into:
+ // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+ if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
+ (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+ And = FirstOp;
+ Shift = SecondOp;
+
+ } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
+ (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+ And = SecondOp;
+ Shift = FirstOp;
+ } else
return SDValue();
- // Is the second op an shl or lshr?
- SDValue Shift = N->getOperand(1);
- // This will have been turned into: AArch64ISD::VSHL vector, #shift
- // or AArch64ISD::VLSHR vector, #shift
- unsigned ShiftOpc = Shift.getOpcode();
- if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
- return SDValue();
- bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+ bool IsAnd = And.getOpcode() == ISD::AND;
+ bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
- // Is the and mask vector all constant?
uint64_t C1;
- if (!isAllConstantBuildVector(And.getOperand(1), C1))
- return SDValue();
+ if (IsAnd) {
+ // Is the and mask vector all constant?
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+ } else {
+ // Reconstruct the corresponding AND immediate from the two BICi immediates.
+ ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
+ ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
+ assert(C1nodeImm && C1nodeShift);
+ C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+ }
- // Is C1 == ~C2, taking into account how much one can shift elements of a
- // particular size?
+ // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
+ // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
+ // how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
- unsigned ElemMask = (1 << ElemSizeInBits) - 1;
- if ((C1 & ElemMask) != (~C2 & ElemMask))
+
+ APInt C1AsAPInt(ElemSizeInBits, C1);
+ APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
+ : APInt::getLowBitsSet(ElemSizeInBits, C2);
+ if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
- unsigned Intrin =
- IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
- SDValue ResultSLI =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
- Shift.getOperand(1));
+ unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
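A standalone sketch of the C1 check above (assuming 0 < C2 < ElemBits <= 64; the real code also admits C2 == ElemBits): for SLI the AND mask must keep exactly the low C2 bits of each element, for SRI exactly the high C2 bits.

#include <cassert>
#include <cstdint>

bool maskMatchesShift(uint64_t C1, unsigned C2, unsigned ElemBits,
                      bool IsShiftRight) {
  assert(C2 > 0 && C2 < ElemBits && ElemBits <= 64);
  uint64_t Ones = ElemBits == 64 ? ~0ULL : (1ULL << ElemBits) - 1;
  uint64_t Required = IsShiftRight ? Ones & ~(Ones >> C2) // high C2 bits set
                                   : (1ULL << C2) - 1;    // low C2 bits set
  return (C1 & Ones) == Required;
}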
@@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
- if (EnableAArch64SlrGeneration) {
- if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
- return Res;
- }
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+ return Res;
EVT VT = Op.getValueType();
@@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
- assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
- "Unsupported floating-point vector type");
+ assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64) && "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
@@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
@@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
@@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- EVT VT = Op.getOperand(0).getValueType();
- SDLoc dl(Op);
- // Just in case...
- if (!VT.isVector())
- return SDValue();
-
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!Cst)
- return SDValue();
- unsigned Val = Cst->getZExtValue();
+ assert(Op.getValueType().isFixedLengthVector() &&
+ "Only cases that extract a fixed length vector are supported!");
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
+ if (InVT.isScalableVector()) {
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG))
+ return Op;
+
+ return SDValue();
+ }
+
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
- if (Val == 0)
+ if (Idx == 0 && InVT.getSizeInBits() <= 128)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
- if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
+ if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
+ return Op;
+
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType().isScalableVector() &&
+ "Only expect to lower inserts into scalable vectors!");
+
+ EVT InVT = Op.getOperand(1).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+
+  // We don't have any patterns for scalable vectors yet.
+ if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
+ return SDValue();
+
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+ // Currently no fixed length shuffles that require SVE are legal.
+ if (useSVEForFixedLengthVectorVT(VT))
+ return false;
+
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
@@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
+// The original form of this expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
+// is called the srl will have been lowered to AArch64ISD::VLSHR and the
+// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
+// This function can also recognize a variant of this pattern that uses sign
+// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
+SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (!VT.isVector() || VT.isScalableVector())
+ return Op;
+
+ if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
+ return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
+
+ // Since we are looking for a right shift by a constant value of 1 and we are
+ // operating on types at least 16 bits in length (sign/zero extended OpA and
+ // OpB, which are at least 8 bits), it follows that the truncate will always
+ // discard the shifted-in bit and therefore the right shift will be logical
+ // regardless of the signedness of OpA and OpB.
+ SDValue Shift = Op.getOperand(0);
+ if (Shift.getOpcode() != AArch64ISD::VLSHR)
+ return Op;
+
+ // Is the right shift using an immediate value of 1?
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
+ if (ShiftAmount != 1)
+ return Op;
+
+ SDValue Sub = Shift->getOperand(0);
+ if (Sub.getOpcode() != ISD::SUB)
+ return Op;
+
+ SDValue Xor = Sub.getOperand(1);
+ if (Xor.getOpcode() != ISD::XOR)
+ return Op;
+
+ SDValue ExtendOpA = Xor.getOperand(0);
+ SDValue ExtendOpB = Sub.getOperand(0);
+ unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
+ unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
+ if (!(ExtendOpAOpc == ExtendOpBOpc &&
+ (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+ return Op;
+
+ // Is the result of the right shift being truncated to the same value type as
+ // the original operands, OpA and OpB?
+ SDValue OpA = ExtendOpA.getOperand(0);
+ SDValue OpB = ExtendOpB.getOperand(0);
+ EVT OpAVT = OpA.getValueType();
+ assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
+ if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+ return Op;
+
+ // Is the XOR using a constant amount of all ones in the right hand side?
+ uint64_t C;
+ if (!isAllConstantBuildVector(Xor.getOperand(1), C))
+ return Op;
+
+ unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+ APInt CAsAPInt(ElemSizeInBits, C);
+ if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+ return Op;
+
+ SDLoc DL(Op);
+ bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
+ unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
+
+ return ResultURHADD;
+}
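The wrap-around identity the matcher depends on can be checked in scalar form (a sketch for the unsigned i8 case): B - ~A equals A + B + 1 modulo 2^16, and zero-extended 8-bit inputs never overflow 16 bits, so the shift-and-truncate yields the rounding halving add that URHADD computes in one instruction.

#include <cstdint>

uint8_t urhaddScalar(uint8_t A, uint8_t B) {
  uint16_t WideA = A, WideB = B;                        // zext to i16
  uint16_t Sub = WideB - static_cast<uint16_t>(~WideA); // == A + B + 1
  return static_cast<uint8_t>(Sub >> 1);                // truncate(lshr(.,1))
}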
+
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
+ if (VT.isScalableVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
+
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
@@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
+ if (VT.isScalableVector()) {
+ unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
+ : AArch64ISD::SRL_MERGE_OP1;
+ return LowerToPredicatedOp(Op, DAG, Opc);
+ }
+
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
@@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
+ if (Op.getValueType().isScalableVector()) {
+ if (Op.getOperand(0).getValueType().isFloatingPoint())
+ return Op;
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
+ }
+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
@@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
@@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
@@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
+SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT != MVT::i64 && "Expected illegal VSCALE node");
+
+ SDLoc DL(Op);
+ APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
+ return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
+ DL, VT);
+}
+
+/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
+template <unsigned NumVecs>
+static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
+ const CallInst &CI) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Retrieve EC from first vector argument.
+ const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
+ ElementCount EC = VT.getVectorElementCount();
+#ifndef NDEBUG
+ // Check the assumption that all input vectors are the same type.
+ for (unsigned I = 0; I < NumVecs; ++I)
+ assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
+ "Invalid type.");
+#endif
+ // memVT is `NumVecs * VT`.
+ Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
+ EC * NumVecs);
+ Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+}
+
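The memVT arithmetic above just stacks NumVecs copies of the vector argument type; a tiny illustrative sketch (the struct is a stand-in for ElementCount):

struct ElemCount { unsigned Min; bool Scalable; };

// An st<N> of N vectors of EC elements covers N*EC elements, so an st3 of
// nxv4i32 records a memVT of nxv12i32.
ElemCount memVTElements(ElemCount EC, unsigned NumVecs) {
  return {EC.Min * NumVecs, EC.Scalable};
}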
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
+ case Intrinsic::aarch64_sve_st2:
+ return setInfoSVEStN<2>(Info, I);
+ case Intrinsic::aarch64_sve_st3:
+ return setInfoSVEStN<3>(Info, I);
+ case Intrinsic::aarch64_sve_st4:
+ return setInfoSVEStN<4>(Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_sve_ldnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOLoad;
+ if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOStore;
+ if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
default:
@@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
- return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+ auto *FullTy = FullV->getType();
+ auto *HalfTy = HalfV->getType();
+ return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
+ 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
};
auto extractHalf = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
+ auto *FullVT = cast<FixedVectorType>(FullV->getType());
+ auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
};
- Constant *M1, *M2;
+ ArrayRef<int> M1, M2;
Value *S1Op1, *S2Op1;
- if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
- !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+ if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
+ !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
return false;
// Check that the operands are half as wide as the result and we extract
@@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
// elements.
int M1Start = -1;
int M2Start = -1;
- int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+ int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
@@ -8948,6 +9682,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
return true;
}
+/// Check if Op could be used with vmull_high_p64 intrinsic.
+static bool isOperandOfVmullHighP64(Value *Op) {
+ Value *VectorOperand = nullptr;
+ ConstantInt *ElementIndex = nullptr;
+ return match(Op, m_ExtractElt(m_Value(VectorOperand),
+ m_ConstantInt(ElementIndex))) &&
+ ElementIndex->getValue() == 1 &&
+ isa<FixedVectorType>(VectorOperand->getType()) &&
+ cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
+}
+
+/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
+static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
+ return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
+}
+
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands(
Ops.push_back(&II->getOperandUse(0));
Ops.push_back(&II->getOperandUse(1));
return true;
+
+ case Intrinsic::aarch64_neon_pmull64:
+ if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
+ II->getArgOperand(1)))
+ return false;
+ Ops.push_back(&II->getArgOperandUse(0));
+ Ops.push_back(&II->getArgOperandUse(1));
+ return true;
+
default:
return false;
}
@@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
- unsigned &RequiredAligment) const {
+ Align &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
- RequiredAligment = 0;
+ RequiredAligment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
@@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
MachineMemOperand::Flags
-AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
@@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
- if (VecTy->getNumElements() < 2)
+ if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
return false;
// Ensure the element type is legal.
@@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
- VectorType *VecTy = Shuffles[0]->getType();
+ VectorType *VTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
return false;
- unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+ unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
+
+ auto *FVTy = cast<FixedVectorType>(VTy);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = FVTy->getElementType();
if (EltTy->isPointerTy())
- VecTy =
- VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ FVTy =
+ FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
IRBuilder<> Builder(LI);
@@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
- VecTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / NumLoads);
+ FVTy = FixedVectorType::get(FVTy->getElementType(),
+ FVTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace()));
+ BaseAddr,
+ FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
- Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
- Type *Tys[2] = {VecTy, PtrTy};
+ Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {FVTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
@@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr =
- Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
- VecTy->getVectorNumElements() * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
+ FVTy->getNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
@@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
- SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
- VecTy->getVectorNumElements()));
+ SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
+ FVTy->getNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
@@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- VectorType *VecTy = SVI->getType();
- assert(VecTy->getVectorNumElements() % Factor == 0 &&
- "Invalid interleaved store");
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
- unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
- Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
- unsigned NumOpElts = Op0->getType()->getVectorNumElements();
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Op0->getType())->getNumElements();
// Convert to the corresponding integer vector.
- Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+ auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
@@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
auto Mask = SVI->getShuffleMask();
@@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
    // If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
@@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
- unsigned AlignCheck) {
- return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
- (DstAlign == 0 || DstAlign % AlignCheck == 0));
+// Lower an SVE structured load intrinsic returning a tuple type to target
+// specific intrinsic taking the same input but returning a multi-result value
+// of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+// <vscale x 4 x i1> %pred,
+// <vscale x 4 x i32>* %addr)
+//
+// Output DAG:
+//
+// t0: ch = EntryToken
+// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+// t4: i64,ch = CopyFromReg t0, Register:i64 %1
+// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+ ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) const {
+ assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+ unsigned N, Opcode;
+ static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+ {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+ std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+ assert(VT.getVectorElementCount().Min % N == 0 &&
+ "invalid tuple vector type!");
+
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount() / N);
+ assert(isTypeLegal(SplitVT));
+
+ SmallVector<EVT, 5> VTs(N, SplitVT);
+ VTs.push_back(MVT::Other); // Chain
+ SDVTList NodeTys = DAG.getVTList(VTs);
+
+ SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+ SmallVector<SDValue, 4> PseudoLoadOps;
+ for (unsigned I = 0; I < N; ++I)
+ PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
}
EVT AArch64TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return MVT::v2i64;
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return MVT::i32;
return MVT::Other;
}
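The cascade above, and its LLT twin below, can be summarised with a small self-contained model. The names here are hypothetical and for illustration only; the real code additionally escapes through allowsMisalignedMemoryAccesses when the alignment check fails, which this sketch omits:

#include <cstdint>

enum class MemTy { V2I64, F128, I64, I32, Other };

// Mirrors the selection order above: one 16-byte SIMD store for large,
// aligned memsets, then f128, i64 and i32 as size/alignment allow.
static MemTy pickMemOpType(uint64_t Size, uint64_t Align, bool IsMemset,
                           bool CanUseNEON, bool CanUseFP) {
  bool IsSmallMemset = IsMemset && Size < 32;
  auto Aligned = [&](uint64_t A) { return Align % A == 0; };
  if (CanUseNEON && IsMemset && !IsSmallMemset && Aligned(16))
    return MemTy::V2I64;
  if (CanUseFP && !IsSmallMemset && Aligned(16))
    return MemTy::F128;
  if (Size >= 8 && Aligned(8))
    return MemTy::I64;
  if (Size >= 4 && Aligned(4))
    return MemTy::I32;
  return MemTy::Other; // let the generic lowering decide
}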
LLT AArch64TargetLowering::getOptimalMemOpLLT(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return LLT::vector(2, 64);
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return LLT::scalar(32);
return LLT();
}
@@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
+ // FIXME: Update this method to support scalable addressing modes.
+ if (isa<ScalableVectorType>(Ty))
+ return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
@@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N,
}
if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
@@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
+ unsigned Opc = Src->getOpcode();
+
+ // Zero/any extend of an unsigned unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+ SDValue UnpkOp = Src->getOperand(0);
+ SDValue Dup = N->getOperand(1);
+
+ if (Dup.getOpcode() != AArch64ISD::DUP)
+ return SDValue();
+
+ SDLoc DL(N);
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
+    if (!C)
+      return SDValue();
+    uint64_t ExtVal = C->getZExtValue();
+
+ // If the mask is fully covered by the unpack, we don't need to push
+ // a new AND onto the operand
+ EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
+ if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
+ (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
+ (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
+ return Src;
+
+    // Truncate to prevent a DUP with an over-wide constant
+ APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
+
+ // Otherwise, make sure we propagate the AND to the operand
+ // of the unpack
+ Dup = DAG.getNode(AArch64ISD::DUP, DL,
+ UnpkOp->getValueType(0),
+ DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
+
+ SDValue And = DAG.getNode(ISD::AND, DL,
+ UnpkOp->getValueType(0), UnpkOp, Dup);
+
+ return DAG.getNode(Opc, DL, N->getValueType(0), And);
+ }
+
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
return SDValue();
- // GLD1* instructions perform an implicit zero-extend, which makes them
+ EVT MemVT;
+
+ // SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
- switch (Src->getOpcode()) {
- case AArch64ISD::GLD1:
- case AArch64ISD::GLD1_SCALED:
- case AArch64ISD::GLD1_SXTW:
- case AArch64ISD::GLD1_SXTW_SCALED:
- case AArch64ISD::GLD1_UXTW:
- case AArch64ISD::GLD1_UXTW_SCALED:
- case AArch64ISD::GLD1_IMM:
+ switch (Opc) {
+ case AArch64ISD::LD1_MERGE_ZERO:
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
return SDValue();
}
- EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
-
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
@@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
@@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
- if (N->getNumOperands() == 2 &&
- N0->getOpcode() == ISD::TRUNCATE &&
- N1->getOpcode() == ISD::TRUNCATE) {
+ if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+ N1Opc == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
@@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
+ // from the same original vectors. Combine these into a single [us]rhadd that
+ // operates on the two original vectors. Example:
+ // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
+ // extract_subvector (v16i8 OpB,
+ // <0>))),
+ // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
+ // extract_subvector (v16i8 OpB,
+ // <8>)))))
+ // ->
+ // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
+ if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
+ (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ SDValue N10 = N1->getOperand(0);
+ SDValue N11 = N1->getOperand(1);
+
+ EVT N00VT = N00.getValueType();
+ EVT N10VT = N10.getValueType();
+
+ if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
+ SDValue N00Source = N00->getOperand(0);
+ SDValue N01Source = N01->getOperand(0);
+ SDValue N10Source = N10->getOperand(0);
+ SDValue N11Source = N11->getOperand(0);
+
+ if (N00Source == N10Source && N01Source == N11Source &&
+ N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
+ assert(N0.getValueType() == N1.getValueType());
+
+ uint64_t N00Index = N00.getConstantOperandVal(1);
+ uint64_t N01Index = N01.getConstantOperandVal(1);
+ uint64_t N10Index = N10.getConstantOperandVal(1);
+ uint64_t N11Index = N11.getConstantOperandVal(1);
+
+ if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
+ N10Index == N00VT.getVectorNumElements())
+ return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
+ }
+ }
+ }
+
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
@@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
- if (N1->getOpcode() != ISD::BITCAST)
+ if (N1Opc != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
@@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
return SDValue();
}
+static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Op2 = N->getOperand(2);
+ EVT ScalarTy = Op1.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ }
+
+ return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
+ Op1, Op2);
+}
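A concrete illustration (not part of the patch): i8 and i16 are not legal scalar types on AArch64, so for an nxv8i16 result of, say, index(i16 4, i16 2), both scalar operands are any-extended to i32 before the INDEX_VECTOR node is built; the extra high bits are harmless because each lane keeps only the low 16 bits.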
+
+static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ SDValue Scalar = N->getOperand(3);
+ EVT ScalarTy = Scalar.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+
+ SDValue Passthru = N->getOperand(1);
+ SDValue Pred = N->getOperand(2);
+ return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
+ Pred, Scalar, Passthru);
+}
+
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
SDLoc dl(N);
LLVMContext &Ctx = *DAG.getContext();
@@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}
-static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
- bool Invert,
+static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize())
@@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
}
}
+ if (!Imm)
+ return SDValue();
+
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
- SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64);
- SDValue Op0, Op1;
- if (Invert) {
- Op0 = Splat;
- Op1 = N->getOperand(2);
- } else {
- Op0 = N->getOperand(2);
- Op1 = Splat;
- }
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- ID, Pred, Op0, Op1);
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
+ N->getOperand(2), Splat, DAG.getCondCode(CC));
}
return SDValue();
@@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
return DAG.getZExtOrTrunc(Res, DL, VT);
}
+static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue VecToReduce = N->getOperand(2);
+
+ EVT ReduceVT = VecToReduce.getValueType();
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
+
+ // SVE reductions set the whole vector register with the first element
+ // containing the reduction result, which we'll now extract.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
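For example (illustration only), @llvm.aarch64.sve.faddv.nxv4f32(%pg, %v) becomes:

  t0: nxv4f32 = AArch64ISD::FADDV_PRED %pg, %v   // lane 0 holds the result
  t1: f32 = extract_vector_elt t0, Constant:i64<0>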
+
+static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue InitVal = N->getOperand(2);
+ SDValue VecToReduce = N->getOperand(3);
+ EVT ReduceVT = VecToReduce.getValueType();
+
+ // Ordered reductions use the first lane of the result vector as the
+ // reduction's initial value.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
+ DAG.getUNDEF(ReduceVT), InitVal, Zero);
+
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
+
+ // SVE reductions set the whole vector register with the first element
+ // containing the reduction result, which we'll now extract.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N,
return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_index:
+ return LowerSVEIntrinsicIndex(N, DAG);
+ case Intrinsic::aarch64_sve_dup:
+ return LowerSVEIntrinsicDUP(N, DAG);
+ case Intrinsic::aarch64_sve_dup_x:
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
+ N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_smin:
+ return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umin:
+ return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_smax:
+ return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umax:
+ return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsl:
+ return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsr:
+ return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_asr:
+ return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_cmphs:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
+ break;
+ case Intrinsic::aarch64_sve_cmphi:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpge:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGE));
+ break;
+ case Intrinsic::aarch64_sve_cmpgt:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpeq:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
+ break;
+ case Intrinsic::aarch64_sve_cmpne:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETNE));
+ break;
+ case Intrinsic::aarch64_sve_fadda:
+ return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
+ case Intrinsic::aarch64_sve_faddv:
+ return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
+ case Intrinsic::aarch64_sve_sel:
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmpeq_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
case Intrinsic::aarch64_sve_cmpne_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpge_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpgt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
case Intrinsic::aarch64_sve_cmple_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphs_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphi_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplo_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
case Intrinsic::aarch64_sve_cmpls_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
case Intrinsic::aarch64_sve_ptest_any:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::ANY_ACTIVE);
@@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N,
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
- // If the source VT is a 64-bit vector, we can play games and get the
- // better results we want.
- if (SrcVT.getSizeInBits() != 64)
+ // If the source VT is a 64-bit fixed or scalable vector, we can play games
+ // and get the better results we want.
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
- unsigned ElementCount = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+ ElementCount SrcEC = SrcVT.getVectorElementCount();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
@@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N,
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
- unsigned NumElements = ResVT.getVectorNumElements();
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
- LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
- ResVT.getVectorElementType(), NumElements / 2);
+ LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
- LoVT.getVectorNumElements());
+ LoVT.getVectorElementCount());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
+ DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
@@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
return NewST1;
}
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8:
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8:
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ case MVT::nxv8i8:
+ case MVT::nxv8i16:
+ case MVT::nxv8f16:
+ case MVT::nxv8bf16:
+ return MVT::nxv8i16;
+ case MVT::nxv16i8:
+ return MVT::nxv16i8;
+ }
+}
+
+static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ EVT ContainerVT = VT;
+ if (ContainerVT.isInteger())
+ ContainerVT = getSVEContainerType(ContainerVT);
+
+ SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT) };
+
+ SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (ContainerVT.isInteger() && (VT != ContainerVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
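Worked example (illustration only): an aarch64_sve_ld1 returning nxv4i16 is loaded in its container type nxv4i32, i.e. an LD1_MERGE_ZERO of nxv4i32 carrying DAG.getValueType(nxv4i16) so that instruction selection can pick LD1H rather than LD1W, followed by a truncate back to nxv4i16.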
+
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
+ if (VT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
@@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
return L;
}
+template <unsigned Opcode>
+static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
+ static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
+ Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
+ "Unsupported opcode.");
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ EVT LoadVT = VT;
+ if (VT.isFloatingPoint())
+ LoadVT = VT.changeTypeToInteger();
+
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
+ SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (VT.isFloatingPoint())
+ Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
+static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Data = N->getOperand(2);
+ EVT DataVT = Data.getValueType();
+ EVT HwSrcVt = getSVEContainerType(DataVT);
+ SDValue InputVT = DAG.getValueType(DataVT);
+
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ if (DataVT.isFloatingPoint())
+ InputVT = DAG.getValueType(HwSrcVt);
+
+ SDValue SrcNew;
+ if (Data.getValueType().isFloatingPoint())
+ SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
+ else
+ SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
+
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(4), // Base
+ N->getOperand(3), // Pg
+ InputVT
+ };
+
+ return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
+}
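The store side is symmetric (sketch only, hypothetical values): for nxv4i16 data,

  t0: nxv4i32 = any_extend %data   // widen to the container type
  t1: ch = AArch64ISD::ST1_PRED %ch, t0, %base, %pg, ValueType:nxv4i16

where the recorded ValueType selects ST1H; floating-point data is bitcast to the integer container instead, and the container type is recorded.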
+
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
@@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
EVT DataVT = Data.getValueType();
EVT PtrTy = N->getOperand(4).getValueType();
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
if (DataVT.isFloatingPoint())
Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
@@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
+ // Avoid scalarizing zero splat stores for scalable vectors.
+ if (VT.isScalableVector())
+ return SDValue();
+
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
@@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
- if (!VT.isVector())
+
+ if (!VT.isFixedLengthVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
@@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
+ if (VT.isScalableVector())
+ return SDValue();
+
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not LOAD, can not do such combine.
@@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
-// Returns an SVE type that ContentTy can be trivially sign or zero extended
-// into.
-static MVT getSVEContainerType(EVT ContentTy) {
- assert(ContentTy.isSimple() && "No SVE containers for extended types");
+// Turns the vector of indices into a vector of byte offsets by scaling Offset
+// by (BitWidth / 8).
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
+ SDLoc DL, unsigned BitWidth) {
+ assert(Offset.getValueType().isScalableVector() &&
+ "This method is only for scalable vectors of offsets");
- switch (ContentTy.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("No known SVE container for this MVT type");
- case MVT::nxv2i8:
- case MVT::nxv2i16:
- case MVT::nxv2i32:
- case MVT::nxv2i64:
- case MVT::nxv2f32:
- case MVT::nxv2f64:
- return MVT::nxv2i64;
- case MVT::nxv4i8:
- case MVT::nxv4i16:
- case MVT::nxv4i32:
- case MVT::nxv4f32:
- return MVT::nxv4i32;
- }
+ SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
+ SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
+
+ return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
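For example (illustration only): with 32-bit elements the shift amount is Log2_32(32 / 8) == 2, so an index vector <0, 1, 2, 3, ...> becomes the byte-offset vector <0, 4, 8, 12, ...> via the splatted left shift.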
-static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+ unsigned ScalarSizeInBytes) {
+ // The immediate is not a multiple of the scalar size.
+ if (OffsetInBytes % ScalarSizeInBytes)
+ return false;
+
+ // The immediate is out of range.
+ if (OffsetInBytes / ScalarSizeInBytes > 31)
+ return false;
+
+ return true;
+}
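A few worked checks, runnable stand-alone (ValidSVEVecImm is a hypothetical mirror of the rules above, not part of the patch):

#include <cassert>

static bool ValidSVEVecImm(unsigned OffsetInBytes, unsigned ScalarSizeInBytes) {
  return OffsetInBytes % ScalarSizeInBytes == 0 &&
         OffsetInBytes / ScalarSizeInBytes <= 31;
}

int main() {
  assert(ValidSVEVecImm(124, 4));  // 31 * 4: the last encodable word offset
  assert(!ValidSVEVecImm(126, 4)); // not a multiple of the element size
  assert(!ValidSVEVecImm(128, 4)); // 32 * 4: one past the encodable range
}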
+
+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+ unsigned ScalarSizeInBytes) {
+ ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+ return OffsetConst && isValidImmForSVEVecImmAddrMode(
+ OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
+
+static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
const SDValue Src = N->getOperand(2);
const EVT SrcVT = Src->getValueType(0);
assert(SrcVT.isScalableVector() &&
@@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(4);
+ SDValue Base = N->getOperand(4);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal scatters because there's no instruction that takes
+  // indices.
+ if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SSTNT1_PRED;
+ }
+
+  // In the case of non-temporal scatter stores there's only one SVE
+  // instruction per data-size: "scalar + vector", i.e.
+  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
+ // Since we do have intrinsics that allow the arguments to be in a different
+ // order, we may need to swap them to match the spec.
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // SST1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the stored items. For
+ // immediates outside that range and non-immediate scalar offsets use SST1 or
+ // SST1_UXTW instead.
+ if (Opcode == AArch64ISD::SST1_IMM_PRED) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ SrcVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = AArch64ISD::SST1_UXTW_PRED;
+ else
+ Opcode = AArch64ISD::SST1_PRED;
+
+ std::swap(Base, Offset);
+ }
+ }
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Source value type that is representable in hardware
EVT HwSrcVt = getSVEContainerType(SrcVT);
- // Keep the original type of the input data to store - this is needed to
- // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
- // integer equivalent, so just use HwSrcVt.
+ // Keep the original type of the input data to store - this is needed to be
+ // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
+ // FP values we want the integer equivalent, so just use HwSrcVt.
SDValue InputVT = DAG.getValueType(SrcVT);
if (SrcVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
@@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, VTs, Ops);
}
-static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
- EVT RetVT = N->getValueType(0);
+static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ const EVT RetVT = N->getValueType(0);
assert(RetVT.isScalableVector() &&
"Gather loads are only possible for SVE vectors");
+
SDLoc DL(N);
+ // Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(3);
+ SDValue Base = N->getOperand(3);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal gathers because there's no instruction that takes
+  // indices.
+ if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ }
+
+ // In the case of non-temporal gather loads there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // Since we do have intrinsics that allow the arguments to be in a different
+ // order, we may need to swap them to match the spec.
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // GLD{FF}1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the loaded items. For
+ // immediates outside that range and non-immediate scalar offsets use
+ // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
+ if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ RetVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
+ : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
+ else
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_MERGE_ZERO
+ : AArch64ISD::GLDFF1_MERGE_ZERO;
+
+ std::swap(Base, Offset);
+ }
+ }
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
// Return value type that is representable in hardware
EVT HwRetVt = getSVEContainerType(RetVT);
- // Keep the original output value type around - this will better inform
- // optimisations (e.g. instruction folding when load is followed by
- // zext/sext). This will only be used for ints, so the value for FPs
- // doesn't matter.
+ // Keep the original output value type around - this is needed to be able to
+ // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
+  // values we want the integer equivalent, so just use HwRetVt.
SDValue OutVT = DAG.getValueType(RetVT);
if (RetVT.isFloatingPoint())
OutVT = DAG.getValueType(HwRetVt);
@@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getMergeValues({Load, LoadChain}, DL);
}
-
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
- // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
+ // Sign extend of an unsigned unpack -> signed unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+
+ unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
+ : AArch64ISD::SUNPKLO;
+
+ // Push the sign extend to the operand of the unpack
+ // This is necessary where, for example, the operand of the unpack
+ // is another unpack:
+ // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
+ // ->
+ // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
+ // ->
+ // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
+ SDValue ExtOp = Src->getOperand(0);
+ auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ EVT EltTy = VT.getVectorElementType();
+ (void)EltTy;
+
+ assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
+ "Sign extending from an invalid type");
+
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ VT.getVectorElementCount() * 2);
+
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
+ ExtOp, DAG.getValueType(ExtVT));
+
+ return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
+ }
+
+ // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
+ unsigned MemVTOpNum = 4;
switch (Opc) {
- case AArch64ISD::GLD1:
- NewOpc = AArch64ISD::GLD1S;
+ case AArch64ISD::LD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SCALED:
- NewOpc = AArch64ISD::GLD1S_SCALED;
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW:
- NewOpc = AArch64ISD::GLD1S_SXTW;
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_SXTW_SCALED;
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW:
- NewOpc = AArch64ISD::GLD1S_UXTW;
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_UXTW_SCALED;
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_IMM:
- NewOpc = AArch64ISD::GLD1S_IMM;
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
break;
default:
return SDValue();
}
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+ EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
- if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+ if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
- SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
- Src->getOperand(3), Src->getOperand(4)};
+
+ SmallVector<SDValue, 5> Ops;
+ for (unsigned I = 0; I < Src->getNumOperands(); ++I)
+ Ops.push_back(Src->getOperand(I));
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
DCI.CombineTo(N, ExtLoad);
@@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue(N, 0);
}
+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. Other offset types
+/// (anything but nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+ const unsigned OffsetPos = 4;
+ SDValue Offset = N->getOperand(OffsetPos);
+
+ // Not an unpacked vector, bail out.
+ if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+ return SDValue();
+
+ // Extend the unpacked offset vector to 64-bit lanes.
+ SDLoc DL(N);
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ // Replace the offset operand with the 64-bit one.
+ Ops[OffsetPos] = Offset;
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic
+/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
+/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
+/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
+/// SVE gather prefetch instruction with vector-plus-immediate addressing mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+ unsigned ScalarSizeInBytes) {
+ const unsigned ImmPos = 4, OffsetPos = 3;
+ // No need to combine the node if the immediate is valid...
+ if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+ return SDValue();
+
+ // ...otherwise swap the offset base with the offset...
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ std::swap(Ops[ImmPos], Ops[OffsetPos]);
+ // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
+ // `aarch64_sve_prfb_gather_uxtw_index`.
+ SDLoc DL(N);
+ Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
+ MVT::i64);
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
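Worked example (illustration only): for aarch64_sve_prfw_gather_scalar_offset (ScalarSizeInBytes == 4), a scalar offset of 200 fails the immediate check since 200 / 4 == 50 > 31, so the vector base and the scalar offset swap roles and the node is remapped to aarch64_sve_prfb_gather_uxtw_index; the prfb form is the right target because, after the swap, the vector operand is an unscaled byte index.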
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
+ return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_ld1rq:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ld1ro:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1:
+ return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnf1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_st1:
+ return performST1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_imm:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1_scatter:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_SXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_imm:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_UXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
+ case Intrinsic::aarch64_sve_tuple_get: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Src1 = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ EVT ResVT = N->getValueType(0);
+ uint64_t NumLanes = ResVT.getVectorElementCount().Min;
+ SDValue Val =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
+ DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
+ return DAG.getMergeValues({Val, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_tuple_set: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Tuple = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+ SDValue Vec = N->getOperand(4);
+
+ EVT TupleVT = Tuple.getValueType();
+ uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
+
+ if ((TupleLanes % NumLanes) != 0)
+ report_fatal_error("invalid tuple vector!");
+
+ uint64_t NumVecs = TupleLanes / NumLanes;
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 0; I < NumVecs; ++I) {
+ if (I == IdxConst)
+ Opnds.push_back(Vec);
+ else {
+ Opnds.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
+ DAG.getConstant(I * NumLanes, DL, MVT::i32)));
+ }
+ }
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
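    // Worked example (illustration only): replacing element 1 of an nxv12i32
    // tuple with an nxv4i32 vector gives TupleLanes == 12, NumLanes == 4 and
    // NumVecs == 3, producing
    //   concat_vectors(extract_subvector(Tuple, 0), Vec,
    //                  extract_subvector(Tuple, 8))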
+ case Intrinsic::aarch64_sve_tuple_create2:
+ case Intrinsic::aarch64_sve_tuple_create3:
+ case Intrinsic::aarch64_sve_tuple_create4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 2; I < N->getNumOperands(); ++I)
+ Opnds.push_back(N->getOperand(I));
+
+ EVT VT = Opnds[0].getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VT.getVectorElementCount() *
+ (N->getNumOperands() - 2));
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_ld2:
+ case Intrinsic::aarch64_sve_ld3:
+ case Intrinsic::aarch64_sve_ld4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Mask = N->getOperand(2);
+ SDValue BasePtr = N->getOperand(3);
+ SDValue LoadOps[] = {Chain, Mask, BasePtr};
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ SDValue Result =
+ LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
default:
break;
}
@@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SDLoc DL(N);
SDValue Op = N->getOperand(0);
- if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+ if (N->getValueType(0) != MVT::i16 ||
+ (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
return;
Op = SDValue(
@@ -12759,6 +14286,40 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+void AArch64TargetLowering::ReplaceExtractSubVectorResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+
+ // Common code will handle these just fine.
+ if (!InVT.isScalableVector() || !InVT.isInteger())
+ return;
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // The following checks bail if this is not a halving operation.
+
+ ElementCount ResEC = VT.getVectorElementCount();
+
+ if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
+ return;
+
+ auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CIndex)
+ return;
+
+ unsigned Index = CIndex->getZExtValue();
+ if ((Index != 0) && (Index != ResEC.Min))
+ return;
+
+ unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
+ EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+
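+ // UUNPK{LO,HI} pulls the requested half into double-width elements; the
+ // truncate below then restores the original element type.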
+ SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
+}
+
// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
@@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
- Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
- Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
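+ // Recombine the two extracted 64-bit halves into the i128 the caller
+ // expects, rather than returning them as separate results.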
+ SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
return;
}
@@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
- Results.push_back(SDValue(CmpSwap, 0));
- Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+ SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
Results.push_back(SDValue(CmpSwap, 3));
}
@@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
+ case ISD::CTPOP:
+ Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
+ return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
@@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
+ case ISD::EXTRACT_SUBVECTOR:
+ ReplaceExtractSubVectorResults(N, Results, DAG);
+ return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
@@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
- if (getTargetMachine().getOptLevel() == 0)
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
@@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
}
+
+bool AArch64TargetLowering::shouldLocalize(
+ const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ // On Darwin, TLS global vars get selected into function calls, which
+ // we don't want localized, as they can get moved into the middle of
+ // another call sequence.
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal();
+ if (GV.isThreadLocal() && Subtarget->isTargetMachO())
+ return false;
+ break;
+ }
+ // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
+ // localizable.
+ case AArch64::ADRP:
+ case AArch64::G_ADD_LOW:
+ return true;
+ default:
+ break;
+ }
+ return TargetLoweringBase::shouldLocalize(MI, TTI);
+}
+
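+ // GlobalISel does not yet support scalable vector types, so fall back to
+ // SelectionDAG whenever an instruction defines or uses one.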
+bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+ if (isa<ScalableVectorType>(Inst.getType()))
+ return true;
+
+ for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+ if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
+ return true;
+
+ return false;
+}
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE container");
+ case MVT::i8:
+ return EVT(MVT::nxv16i8);
+ case MVT::i16:
+ return EVT(MVT::nxv8i16);
+ case MVT::i32:
+ return EVT(MVT::nxv4i32);
+ case MVT::i64:
+ return EVT(MVT::nxv2i64);
+ case MVT::f16:
+ return EVT(MVT::nxv8f16);
+ case MVT::f32:
+ return EVT(MVT::nxv4f32);
+ case MVT::f64:
+ return EVT(MVT::nxv2f64);
+ }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+
+ int PgPattern;
+ switch (VT.getVectorNumElements()) {
+ default:
+ llvm_unreachable("unexpected element count for SVE predicate");
+ case 1:
+ PgPattern = AArch64SVEPredPattern::vl1;
+ break;
+ case 2:
+ PgPattern = AArch64SVEPredPattern::vl2;
+ break;
+ case 4:
+ PgPattern = AArch64SVEPredPattern::vl4;
+ break;
+ case 8:
+ PgPattern = AArch64SVEPredPattern::vl8;
+ break;
+ case 16:
+ PgPattern = AArch64SVEPredPattern::vl16;
+ break;
+ case 32:
+ PgPattern = AArch64SVEPredPattern::vl32;
+ break;
+ case 64:
+ PgPattern = AArch64SVEPredPattern::vl64;
+ break;
+ case 128:
+ PgPattern = AArch64SVEPredPattern::vl128;
+ break;
+ case 256:
+ PgPattern = AArch64SVEPredPattern::vl256;
+ break;
+ }
+
+ // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+ // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+ // variants of instructions when available.
+
+ MVT MaskVT;
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE predicate");
+ case MVT::i8:
+ MaskVT = MVT::nxv16i1;
+ break;
+ case MVT::i16:
+ case MVT::f16:
+ MaskVT = MVT::nxv8i1;
+ break;
+ case MVT::i32:
+ case MVT::f32:
+ MaskVT = MVT::nxv4i1;
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ MaskVT = MVT::nxv2i1;
+ break;
+ }
+
+ return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+ DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal scalable vector!");
+ auto PredTy = VT.changeVectorElementType(MVT::i1);
+ return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
+}
+
+static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
+ if (VT.isFixedLengthVector())
+ return getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ return getPredicateForScalableVector(DAG, DL, VT);
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ "Expected to convert into a scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected a fixed length vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
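+ // Insert the fixed length vector at index 0 of an undef scalable vector;
+ // the lanes beyond it remain undefined.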
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isFixedLengthVector() &&
+ "Expected to convert into a fixed length vector!");
+ assert(V.getValueType().isScalableVector() &&
+ "Expected a scalable vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<LoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewLoad = DAG.getMaskedLoad(
+ ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+ Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+ Load->getExtensionType());
+
+ auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+ SDValue MergedValues[2] = {Result, Load->getChain()};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Store = cast<StoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ return DAG.getMaskedStore(
+ Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ // Repeatedly truncate Val until the result is of the desired element type.
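+ // Each step bitcasts to elements of half the width and uses UZP1 to gather
+ // the even-indexed sub-elements, which hold the truncated values.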
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv2i64:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
+ assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
+SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOp) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ auto Pg = getPredicateForVector(DAG, DL, VT);
+
+ if (useSVEForFixedLengthVectorVT(VT)) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create the list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ if (isa<CondCodeSDNode>(V)) {
+ Operands.push_back(V);
+ continue;
+ }
+
+ assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+ "Only fixed length vectors are supported!");
+ Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+ }
+
+ assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
+ Operands.push_back(V);
+ }
+
+ return DAG.getNode(NewOp, DL, VT, Operands);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 672dfc4fcbc06..4fe77481706b3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -25,6 +25,26 @@ namespace llvm {
namespace AArch64ISD {
+// For predicated nodes where the result is a vector, the operation is
+// controlled by a governing predicate and the inactive lanes are explicitly
+// defined with a value, use the following naming convention:
+//
+// _MERGE_OP<n> The result value is a vector with inactive lanes equal
+// to source operand OP<n>.
+//
+// _MERGE_ZERO The result value is a vector with inactive lanes
+// actively zeroed.
+//
+// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal
+// to the last source operand, whose only purpose is to
+// serve as a passthru value.
+//
+// For other cases where no explicit action is needed to set the inactive lanes,
+// or when the result is not a vector but it is useful to
+// distinguish a node from similar unpredicated nodes, use:
+//
+// _PRED
+//
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
@@ -52,6 +72,22 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions
+ // Arithmetic instructions
+ ADD_PRED,
+ FADD_PRED,
+ SDIV_PRED,
+ UDIV_PRED,
+ FMA_PRED,
+ SMIN_MERGE_OP1,
+ UMIN_MERGE_OP1,
+ SMAX_MERGE_OP1,
+ UMAX_MERGE_OP1,
+ SHL_MERGE_OP1,
+ SRL_MERGE_OP1,
+ SRA_MERGE_OP1,
+
+ SETCC_MERGE_ZERO,
+
// Arithmetic instructions which write flags.
ADDS,
SUBS,
@@ -90,9 +126,9 @@ enum NodeType : unsigned {
BICi,
ORRi,
- // Vector bit select: similar to ISD::VSELECT but not all bits within an
+ // Vector bitwise select: similar to ISD::VSELECT but not all bits within an
// element must be identical.
- BSL,
+ BSP,
// Vector arithmetic negation
NEG,
@@ -121,6 +157,10 @@ enum NodeType : unsigned {
SRSHR_I,
URSHR_I,
+ // Vector shift by constant and insert
+ VSLI,
+ VSRI,
+
// Vector comparisons
CMEQ,
CMGE,
@@ -148,6 +188,10 @@ enum NodeType : unsigned {
SADDV,
UADDV,
+ // Vector rounding halving addition
+ SRHADD,
+ URHADD,
+
// Vector across-lanes min/max
// Only the lower result lane is defined.
SMINV,
@@ -166,7 +210,7 @@ enum NodeType : unsigned {
// Vector bitwise negation
NOT,
- // Vector bitwise selection
+ // Vector bitwise insertion
BIT,
// Compare-and-branch
@@ -196,8 +240,10 @@ enum NodeType : unsigned {
UMULL,
// Reciprocal estimates and steps.
- FRECPE, FRECPS,
- FRSQRTE, FRSQRTS,
+ FRECPE,
+ FRECPS,
+ FRSQRTE,
+ FRSQRTS,
SUNPKHI,
SUNPKLO,
@@ -211,35 +257,97 @@ enum NodeType : unsigned {
REV,
TBL,
+ // Floating-point reductions.
+ FADDA_PRED,
+ FADDV_PRED,
+ FMAXV_PRED,
+ FMAXNMV_PRED,
+ FMINV_PRED,
+ FMINNMV_PRED,
+
INSR,
PTEST,
PTRUE,
+ DUP_MERGE_PASSTHRU,
+ INDEX_VECTOR,
+
+ REINTERPRET_CAST,
+
+ LD1_MERGE_ZERO,
+ LD1S_MERGE_ZERO,
+ LDNF1_MERGE_ZERO,
+ LDNF1S_MERGE_ZERO,
+ LDFF1_MERGE_ZERO,
+ LDFF1S_MERGE_ZERO,
+ LD1RQ_MERGE_ZERO,
+ LD1RO_MERGE_ZERO,
+
+ // Structured loads.
+ SVE_LD2_MERGE_ZERO,
+ SVE_LD3_MERGE_ZERO,
+ SVE_LD4_MERGE_ZERO,
+
// Unsigned gather loads.
- GLD1,
- GLD1_SCALED,
- GLD1_UXTW,
- GLD1_SXTW,
- GLD1_UXTW_SCALED,
- GLD1_SXTW_SCALED,
- GLD1_IMM,
+ GLD1_MERGE_ZERO,
+ GLD1_SCALED_MERGE_ZERO,
+ GLD1_UXTW_MERGE_ZERO,
+ GLD1_SXTW_MERGE_ZERO,
+ GLD1_UXTW_SCALED_MERGE_ZERO,
+ GLD1_SXTW_SCALED_MERGE_ZERO,
+ GLD1_IMM_MERGE_ZERO,
// Signed gather loads
- GLD1S,
- GLD1S_SCALED,
- GLD1S_UXTW,
- GLD1S_SXTW,
- GLD1S_UXTW_SCALED,
- GLD1S_SXTW_SCALED,
- GLD1S_IMM,
+ GLD1S_MERGE_ZERO,
+ GLD1S_SCALED_MERGE_ZERO,
+ GLD1S_UXTW_MERGE_ZERO,
+ GLD1S_SXTW_MERGE_ZERO,
+ GLD1S_UXTW_SCALED_MERGE_ZERO,
+ GLD1S_SXTW_SCALED_MERGE_ZERO,
+ GLD1S_IMM_MERGE_ZERO,
+
+ // Unsigned first-faulting gather loads.
+ GLDFF1_MERGE_ZERO,
+ GLDFF1_SCALED_MERGE_ZERO,
+ GLDFF1_UXTW_MERGE_ZERO,
+ GLDFF1_SXTW_MERGE_ZERO,
+ GLDFF1_UXTW_SCALED_MERGE_ZERO,
+ GLDFF1_SXTW_SCALED_MERGE_ZERO,
+ GLDFF1_IMM_MERGE_ZERO,
+
+ // Signed first-faulting gather loads.
+ GLDFF1S_MERGE_ZERO,
+ GLDFF1S_SCALED_MERGE_ZERO,
+ GLDFF1S_UXTW_MERGE_ZERO,
+ GLDFF1S_SXTW_MERGE_ZERO,
+ GLDFF1S_UXTW_SCALED_MERGE_ZERO,
+ GLDFF1S_SXTW_SCALED_MERGE_ZERO,
+ GLDFF1S_IMM_MERGE_ZERO,
+
+ // Non-temporal gather loads
+ GLDNT1_MERGE_ZERO,
+ GLDNT1_INDEX_MERGE_ZERO,
+ GLDNT1S_MERGE_ZERO,
+
+ // Contiguous masked store.
+ ST1_PRED,
+
// Scatter store
- SST1,
- SST1_SCALED,
- SST1_UXTW,
- SST1_SXTW,
- SST1_UXTW_SCALED,
- SST1_SXTW_SCALED,
- SST1_IMM,
+ SST1_PRED,
+ SST1_SCALED_PRED,
+ SST1_UXTW_PRED,
+ SST1_SXTW_PRED,
+ SST1_UXTW_SCALED_PRED,
+ SST1_SXTW_SCALED_PRED,
+ SST1_IMM_PRED,
+
+ // Non-temporal scatter store
+ SSTNT1_PRED,
+ SSTNT1_INDEX_PRED,
+
+ // Strict (exception-raising) floating point comparison
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPE,
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -272,7 +380,8 @@ enum NodeType : unsigned {
STZ2G,
LDP,
- STP
+ STP,
+ STNP
};
} // end namespace AArch64ISD
@@ -321,7 +430,8 @@ public:
return MVT::getIntegerVT(64);
}
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
@@ -333,9 +443,10 @@ public:
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
/// LLT variant.
- bool allowsMisalignedMemoryAccesses(
- LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *Fast = nullptr) const override;
+ bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace,
+ Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -376,9 +487,6 @@ public:
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -402,7 +510,7 @@ public:
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
- bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+ bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -418,13 +526,11 @@ public:
bool shouldConsiderGEPOffsetSplit() const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
- LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const override;
+ LLT getOptimalMemOpLLT(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
@@ -463,6 +569,13 @@ public:
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ // Using overflow ops for overflow checks only should be beneficial on
+ // AArch64.
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
+ }
+
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -497,7 +610,7 @@ public:
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X0;
@@ -505,7 +618,7 @@ public:
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X1;
@@ -611,13 +724,27 @@ public:
unsigned getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const;
- MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+ MachineMemOperand::Flags getTargetMMOFlags(
+ const Instruction &I) const override;
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
CallingConv::ID CallConv,
bool isVarArg) const override;
/// Used for exception handling on Win64.
bool needsFixedCatchObjects() const override;
+
+ bool fallBackToDAGISel(const Instruction &Inst) const override;
+
+ /// SVE code generation for fixed length vectors does not custom lower
+ /// BUILD_VECTOR. This makes BUILD_VECTOR legalisation a source of stores to
+ /// merge. However, merging them creates a BUILD_VECTOR that is just as
+ /// illegal as the original, thus leading to an infinite legalisation loop.
+ /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
+ /// vector types this override can be removed.
+ bool mergeStoresAfterLegalization(EVT VT) const override {
+ return !useSVEForFixedLengthVectors();
+ }
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -626,6 +753,7 @@ private:
bool isExtFreeImpl(const Instruction *Ext) const override;
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+ void addTypeForFixedLengthSVE(MVT VT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
@@ -729,7 +857,11 @@ private:
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOp) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
@@ -746,6 +878,8 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
@@ -753,6 +887,13 @@ private:
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;
+ SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
+
+ SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
@@ -807,10 +948,19 @@ private:
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ void ReplaceExtractSubVectorResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
void finalizeLowering(MachineFunction &MF) const override;
+
+ bool shouldLocalize(const MachineInstr &MI,
+ const TargetTransformInfo *TTI) const override;
+
+ bool useSVEForFixedLengthVectors() const;
+ bool useSVEForFixedLengthVectorVT(EVT VT) const;
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index c3efe03a0987f..6df7970f4d82b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -20,6 +20,30 @@ class Format<bits<2> val> {
def PseudoFrm : Format<0>;
def NormalFrm : Format<1>; // Do we need any others?
+// Enum describing whether an instruction is
+// destructive in its first source operand.
+class DestructiveInstTypeEnum<bits<4> val> {
+ bits<4> Value = val;
+}
+def NotDestructive : DestructiveInstTypeEnum<0>;
+// Destructive in its first operand and can be MOVPRFX'd, but has no other
+// special properties.
+def DestructiveOther : DestructiveInstTypeEnum<1>;
+def DestructiveUnary : DestructiveInstTypeEnum<2>;
+def DestructiveBinaryImm : DestructiveInstTypeEnum<3>;
+def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>;
+def DestructiveBinary : DestructiveInstTypeEnum<5>;
+def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
+def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
+def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
+
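+// Enum describing what an instruction leaves in the false (inactive) lanes:
+// no particular value, zero, or undef.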
+class FalseLanesEnum<bits<2> val> {
+ bits<2> Value = val;
+}
+def FalseLanesNone : FalseLanesEnum<0>;
+def FalseLanesZero : FalseLanesEnum<1>;
+def FalseLanesUndef : FalseLanesEnum<2>;
+
// AArch64 Instruction Format
class AArch64Inst<Format f, string cstr> : Instruction {
field bits<32> Inst; // Instruction encoding.
@@ -34,6 +58,16 @@ class AArch64Inst<Format f, string cstr> : Instruction {
let Namespace = "AArch64";
Format F = f;
bits<2> Form = F.Value;
+
+ // Defaults
+ FalseLanesEnum FalseLanes = FalseLanesNone;
+ DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
+ ElementSizeEnum ElementSize = ElementSizeNone;
+
+ let TSFlags{8-7} = FalseLanes.Value;
+ let TSFlags{6-3} = DestructiveInstType.Value;
+ let TSFlags{2-0} = ElementSize.Value;
+
let Pattern = [];
let Constraints = cstr;
}
@@ -48,6 +82,7 @@ class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
dag InOperandList = iops;
let Pattern = pattern;
let isCodeGenOnly = 1;
+ let isPseudo = 1;
}
// Real instructions (have encoding information)
@@ -56,14 +91,6 @@ class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
let Size = 4;
}
-// Enum describing whether an instruction is
-// destructive in its first source operand.
-class DestructiveInstTypeEnum<bits<1> val> {
- bits<1> Value = val;
-}
-def NotDestructive : DestructiveInstTypeEnum<0>;
-def Destructive : DestructiveInstTypeEnum<1>;
-
// Normal instructions
class I<dag oops, dag iops, string asm, string operands, string cstr,
list<dag> pattern>
@@ -71,13 +98,6 @@ class I<dag oops, dag iops, string asm, string operands, string cstr,
dag OutOperandList = oops;
dag InOperandList = iops;
let AsmString = !strconcat(asm, operands);
-
- // Destructive operations (SVE)
- DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
- ElementSizeEnum ElementSize = ElementSizeB;
-
- let TSFlags{3} = DestructiveInstType.Value;
- let TSFlags{2-0} = ElementSize.Value;
}
class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
@@ -327,6 +347,18 @@ def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]>
let DecoderMethod = "DecodeSImm<5>";
}
+def simm5_8b : Operand<i32>, ImmLeaf<i32, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+ let PrintMethod = "printSImm<8>";
+}
+
+def simm5_16b : Operand<i32>, ImmLeaf<i32, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+ let PrintMethod = "printSImm<16>";
+}
+
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
@@ -349,6 +381,8 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
+def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>;
+
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
@@ -358,6 +392,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
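+// Identity transform for unit-scaled immediates, for consistency with the
+// scaled forms that follow.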
+def UImmS1XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -446,6 +483,19 @@ def uimm6s16 : Operand<i64>, ImmLeaf<i64,
let ParserMatchClass = UImm6s16Operand;
}
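+// Transforms that divide a scaled signed immediate down to the multiple
+// that is actually encoded in the instruction.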
+def SImmS2XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def SImmS3XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
+}]>;
+def SImmS4XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def SImmS16XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+}]>;
+
// simm6sN predicate - True if the immediate is a multiple of N in the range
// [-32 * N, 31 * N].
def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
@@ -461,6 +511,7 @@ def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
+def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>;
def simm4s1 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-8 && Imm <= 7; }]> {
@@ -469,31 +520,37 @@ def simm4s1 : Operand<i64>, ImmLeaf<i64,
}
def simm4s2 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = SImm4s2Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s3 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
let PrintMethod = "printImmScale<3>";
let ParserMatchClass = SImm4s3Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s4 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = SImm4s4Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s16 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = SImm4s16Operand;
let DecoderMethod = "DecodeSImm<4>";
}
+def simm4s32 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
+ let PrintMethod = "printImmScale<32>";
+ let ParserMatchClass = SImm4s32Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
def Imm1_8Operand : AsmImmRange<1, 8>;
def Imm1_16Operand : AsmImmRange<1, 16>;
@@ -647,6 +704,13 @@ def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
}
+def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64Imm";
+ let ParserMatchClass = Imm1_64Operand;
+}
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
@@ -683,6 +747,36 @@ def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_63Operand;
}
+// Same as vecshiftL#N, but use TargetConstant (TimmLeaf) instead of Constant
+// (ImmLeaf)
+def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 8);
+}]> {
+ let EncoderMethod = "getVecShiftL8OpValue";
+ let DecoderMethod = "DecodeVecShiftL8Imm";
+ let ParserMatchClass = Imm0_7Operand;
+}
+def tvecshiftL16 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 16);
+}]> {
+ let EncoderMethod = "getVecShiftL16OpValue";
+ let DecoderMethod = "DecodeVecShiftL16Imm";
+ let ParserMatchClass = Imm0_15Operand;
+}
+def tvecshiftL32 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let EncoderMethod = "getVecShiftL32OpValue";
+ let DecoderMethod = "DecodeVecShiftL32Imm";
+ let ParserMatchClass = Imm0_31Operand;
+}
+def tvecshiftL64 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 64);
+}]> {
+ let EncoderMethod = "getVecShiftL64OpValue";
+ let DecoderMethod = "DecodeVecShiftL64Imm";
+ let ParserMatchClass = Imm0_63Operand;
+}
// Crazy immediate formats used by 32-bit and 64-bit logical immediate
// instructions for splatting repeating bit patterns across the immediate.
@@ -796,7 +890,7 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
}
// timm0_31 predicate - same as imm0_31, but use TargetConstant (TimmLeaf)
-// instead of Contant (ImmLeaf)
+// instead of Constant (ImmLeaf)
def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
@@ -832,7 +926,7 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
}
// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
-def imm32_0_7 : Operand<i32>, ImmLeaf<i32, [{
+def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
@@ -1091,29 +1185,44 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let RenderMethod = "addVectorIndexOperands";
}
-class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred>
- : Operand<ty>, ImmLeaf<ty, pred> {
+class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc>
+ : Operand<ty> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
}
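+// Instantiate both an ImmLeaf and a TImmLeaf ("_timm" suffix) form of a
+// vector index operand, so patterns can match plain or target constants.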
+multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
+ def "" : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>;
+ def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>;
+}
+
def VectorIndex1Operand : AsmVectorIndex<1, 1>;
def VectorIndexBOperand : AsmVectorIndex<0, 15>;
def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
-def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
-
-def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
+ [{ return ((uint64_t)Imm) == 1; }]>;
+defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm VectorIndexH : VectorIndex<i64, VectorIndexHOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm VectorIndexS : VectorIndex<i64, VectorIndexSOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
+defm VectorIndexD : VectorIndex<i64, VectorIndexDOperand,
+ [{ return ((uint64_t)Imm) < 2; }]>;
+
+defm VectorIndex132b : VectorIndex<i32, VectorIndex1Operand,
+ [{ return ((uint64_t)Imm) == 1; }]>;
+defm VectorIndexB32b : VectorIndex<i32, VectorIndexBOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm VectorIndexH32b : VectorIndex<i32, VectorIndexHOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm VectorIndexS32b : VectorIndex<i32, VectorIndexSOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
+defm VectorIndexD32b : VectorIndex<i32, VectorIndexDOperand,
+ [{ return ((uint64_t)Imm) < 2; }]>;
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
@@ -1121,16 +1230,21 @@ def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
-def sve_elm_idx_extdup_b
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
-def sve_elm_idx_extdup_h
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
-def sve_elm_idx_extdup_s
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def sve_elm_idx_extdup_d
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def sve_elm_idx_extdup_q
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+defm sve_elm_idx_extdup_b
+ : VectorIndex<i64, SVEVectorIndexExtDupBOperand,
+ [{ return ((uint64_t)Imm) < 64; }]>;
+defm sve_elm_idx_extdup_h
+ : VectorIndex<i64, SVEVectorIndexExtDupHOperand,
+ [{ return ((uint64_t)Imm) < 32; }]>;
+defm sve_elm_idx_extdup_s
+ : VectorIndex<i64, SVEVectorIndexExtDupSOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm sve_elm_idx_extdup_d
+ : VectorIndex<i64, SVEVectorIndexExtDupDOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm sve_elm_idx_extdup_q
+ : VectorIndex<i64, SVEVectorIndexExtDupQOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
@@ -1533,6 +1647,8 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeAuthLoadInstruction";
}
multiclass AuthLoad<bit M, string asm, Operand opr> {
@@ -4333,14 +4449,14 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Unscaled half-precision to 32-bit
def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
- [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+ [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Predicates = [HasFullFP16];
}
// Unscaled half-precision to 64-bit
def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
- [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+ [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
}
@@ -4375,7 +4491,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
// Scaled half-precision to 32-bit
def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
fixedpoint_f16_i32, asm,
- [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+ [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
@@ -4385,7 +4501,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
// Scaled half-precision to 64-bit
def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
fixedpoint_f16_i64, asm,
- [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+ [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
@@ -4501,7 +4617,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Scaled
def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(fdiv (node GPR32:$Rn),
fixedpoint_f16_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
@@ -4529,7 +4645,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
}
def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(fdiv (node GPR64:$Rn),
fixedpoint_f16_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
@@ -4702,19 +4818,19 @@ class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
multiclass FPConversion<string asm> {
// Double-precision to Half-precision
def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
- [(set FPR16:$Rd, (fpround FPR64:$Rn))]>;
+ [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
// Double-precision to Single-precision
def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
- [(set FPR32:$Rd, (fpround FPR64:$Rn))]>;
+ [(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>;
// Half-precision to Double-precision
def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
- [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+ [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Half-precision to Single-precision
def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
- [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+ [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Single-precision to Double-precision
def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
@@ -4722,7 +4838,7 @@ multiclass FPConversion<string asm> {
// Single-precision to Half-precision
def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
- [(set FPR16:$Rd, (fpround FPR32:$Rn))]>;
+ [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
}
//---
@@ -4824,7 +4940,7 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm,
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
- [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+ [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
@@ -4866,7 +4982,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
SDPatternOperator node> {
def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
@@ -4928,7 +5044,7 @@ multiclass FPComparison<bit signalAllNans, string asm,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV] in {
def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
- [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+ [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
@@ -5142,6 +5258,47 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+ : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
+ Sched<[WriteV]>;
+
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorPseudo<V64,
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorPseudo<V128,
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]>;
+
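+ // Bitwise operations are element-size agnostic, so the patterns below
+ // simply reuse the byte-element pseudos for the wider element types.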
+ def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+ (v4i16 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+ (v2i32 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+ (v1i64 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+ (v8i16 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+ (v4i32 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+ (v2i64 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -5362,7 +5519,7 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
}
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
- string asm, SDPatternOperator OpNode> {
+ string asm, SDPatternOperator OpNode = null_frag> {
def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
@@ -5402,11 +5559,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
// bytes from S-sized elements.
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
- BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+ BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
[(set (AccumType RegType:$dst),
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
@@ -5414,10 +5571,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
v2i32, v8i8, OpNode>;
- def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+ def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
v4i32, v16i8, OpNode>;
}
@@ -6581,13 +6738,13 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+ def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
let Predicates = [HasNEON, HasFullFP16] in {
- def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
- [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+ def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
}
@@ -6598,12 +6755,12 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+ def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
- def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[]>;
} // Predicates = [HasNEON, HasFullFP16]
}
@@ -6794,7 +6951,7 @@ multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
- [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
}
@@ -6936,10 +7093,10 @@ multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
asm, ".4h",
- [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
asm, ".8h",
- [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
@@ -7136,7 +7293,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst,
(inst V128:$dst, idxtype:$idx, regtype:$src)>;
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
- : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2"
# "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
@@ -7377,7 +7534,7 @@ class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
- : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+ : InstAlias<asm # "{\t$dst, $src" # size # "$index"
# "|\t$dst, $src$index}",
(inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
@@ -7651,13 +7808,152 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
let Inst{4-0} = Rd;
}
+
+//----------------------------------------------------------------------------
+// Armv8.6 BFloat16 Extension
+//----------------------------------------------------------------------------
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {
+
+class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
+ string kind2, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType>
+ : BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
+ (int_aarch64_neon_bfdot (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType RegType:$Rm)))]> {
+ let AsmString = !strconcat(asm,
+ "{\t$Rd" # kind1 # ", $Rn" # kind2 #
+ ", $Rm" # kind2 # "}");
+}
+
+multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
+ def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
+ v2f32, v8i8>;
+ def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
+ v4f32, v16i8>;
+}
+
+class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
+ string dst_kind, string lhs_kind,
+ string rhs_kind,
+ RegisterOperand RegType,
+ ValueType AccumType,
+ ValueType InputType>
+ : BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
+ RegType, RegType, V128, VectorIndexS,
+ asm, "", dst_kind, lhs_kind, rhs_kind,
+ [(set (AccumType RegType:$dst),
+ (AccumType (int_aarch64_neon_bfdot
+ (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexH:$idx)))))))]> {
+
+ bits<2> idx;
+ let Inst{21} = idx{0}; // L
+ let Inst{11} = idx{1}; // H
+}
+
+multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
+
+ def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
+ ".2h", V64, v2f32, v8i8>;
+ def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
+ ".2h", V128, v4f32, v16i8>;
+}
+
+class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
+ : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
+ [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
+}
+
+class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
+ : I<(outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
+ "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
+ [(set (v4f32 V128:$dst),
+ (v4f32 (OpNode (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 (bitconvert (v8bf16
+ (AArch64duplane16 (v8bf16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<4> Rm;
+ bits<3> idx;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-22} = 0b00111111;
+ let Inst{21-20} = idx{1-0};
+ let Inst{19-16} = Rm;
+ let Inst{15-12} = 0b1111;
+ let Inst{11} = idx{2}; // H
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDThreeSameVectorBF16MatrixMul<string asm>
+ : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
+ V128, asm, ".4s",
+ [(set (v4f32 V128:$dst),
+ (int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
+ ", $Rm", ".8h", "}");
+}
+
+class SIMD_BFCVTN
+ : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+ "bfcvtn", ".4h", ".4s",
+ [(set (v8bf16 V128:$Rd),
+ (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+
+class SIMD_BFCVTN2
+ : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
+ "bfcvtn2", ".8h", ".4s",
+ [(set (v8bf16 V128:$dst),
+ (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+
+class BF16ToSinglePrecision<string asm>
+ : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-10} = 0b0001111001100011010000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
+
+//----------------------------------------------------------------------------
+// Armv8.6 Matrix Multiply Extension
+//----------------------------------------------------------------------------
+
+class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
+ : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+}
+
+//----------------------------------------------------------------------------
// ARMv8.2-A Dot Product Instructions (Indexed)
-class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
- string lhs_kind, string rhs_kind,
+class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
+ string dst_kind, string lhs_kind, string rhs_kind,
RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
- BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+ BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
@@ -7670,11 +7966,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
let Inst{11} = idx{1}; // H
}
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+ def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
V64, v2i32, v8i8, OpNode>;
- def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+ def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
V128, v4i32, v16i8, OpNode>;
}
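+// Illustrative use (assumed, not from this patch): plain dot products keep
+// size=0b10 with Mixed=0, e.g.
+//   defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot",
+//                                               int_aarch64_neon_sdot>;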
@@ -7813,6 +8109,34 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ // Patterns for f16: DUPLANE, DUP scalar and vector_extract.
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64duplane16 (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v8i16_indexed")
+ V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64dup (f16 FPR16Op_lo:$Rm)))),
+ (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
+
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64duplane16 (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v4i16_indexed")
+ V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64dup (f16 FPR16Op_lo:$Rm)))),
+ (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
+
+ def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
+ (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
+ V128_lo:$Rm, VectorIndexH:$idx)>;
+ } // Predicates = [HasNEON, HasFullFP16]
+
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
@@ -7847,15 +8171,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
(!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
- // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
- def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
- (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
- (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
- (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
@@ -7940,6 +8260,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
}
}
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+ SDPatternOperator OpNodeLaneQ> {
+
+ def : Pat<(v4i16 (OpNodeLane
+ (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i16 (OpNodeLaneQ
+ (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLane
+ (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLaneQ
+ (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLane
+ (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLaneQ
+ (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLane
+ (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLaneQ
+ (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+}
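+
+// Note: these patterns let the by-lane forms reuse the existing "_indexed"
+// instructions; 64-bit Rm operands are widened to 128 bits with
+// SUBREG_TO_REG (dsub) because the indexed instructions expect a 128-bit
+// multiplier register.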
+
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
@@ -10154,15 +10532,15 @@ class ComplexRotationOperand<int Angle, int Remainder, string Type>
let DiagnosticType = "InvalidComplexRotation" # Type;
let Name = "ComplexRotation" # Type;
}
-def complexrotateop : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
- SDNodeXForm<imm, [{
+def complexrotateop : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">;
let PrintMethod = "printComplexRotationOp<90, 0>";
}
-def complexrotateopodd : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
- SDNodeXForm<imm, [{
+def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
new file mode 100644
index 0000000000000..a0e7c782f68c3
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -0,0 +1,124 @@
+//=----- AArch64InstrGISel.td - AArch64 GISel target pseudos -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 GlobalISel target pseudo instruction definitions. This is kept
+// separate from the other tablegen files for organizational purposes, but
+// shares the same infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+class AArch64GenericInstruction : GenericInstruction {
+ let Namespace = "AArch64";
+}
+
+// A pseudo to represent a relocatable add instruction as part of address
+// computation.
+def G_ADD_LOW : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src, type2:$imm);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev16 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV16 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev32 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV32 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev64 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV64 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Represents an uzp1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents an uzp2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a zip1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_ZIP1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a zip2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_ZIP2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a dup instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_DUP : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$lane);
+ let hasSideEffects = 0;
+}
+
+// Represents a trn1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_TRN1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a trn2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_TRN2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents an ext instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_EXT : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm);
+}
+
+def : GINodeEquiv<G_REV16, AArch64rev16>;
+def : GINodeEquiv<G_REV32, AArch64rev32>;
+def : GINodeEquiv<G_REV64, AArch64rev64>;
+def : GINodeEquiv<G_UZP1, AArch64uzp1>;
+def : GINodeEquiv<G_UZP2, AArch64uzp2>;
+def : GINodeEquiv<G_ZIP1, AArch64zip1>;
+def : GINodeEquiv<G_ZIP2, AArch64zip2>;
+def : GINodeEquiv<G_DUP, AArch64dup>;
+def : GINodeEquiv<G_TRN1, AArch64trn1>;
+def : GINodeEquiv<G_TRN2, AArch64trn2>;
+def : GINodeEquiv<G_EXT, AArch64ext>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 54f3f7c101324..5139ae5ccaf19 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -24,9 +24,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -111,6 +111,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// This gets lowered to an instruction sequence which takes 16 bytes
NumBytes = 16;
break;
+ case AArch64::SpeculationBarrierISBDSBEndBB:
+ // This gets lowered to 2 4-byte instructions.
+ NumBytes = 8;
+ break;
+ case AArch64::SpeculationBarrierSBEndBB:
+ // This gets lowered to a single 4-byte instruction.
+ NumBytes = 4;
+ break;
case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
@@ -119,11 +127,25 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
+ case TargetOpcode::BUNDLE:
+ NumBytes = getInstBundleLength(MI);
+ break;
}
return NumBytes;
}
+unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+ unsigned Size = 0;
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+ while (++I != E && I->isInsideBundle()) {
+ assert(!I->isBundle() && "No nested bundle!");
+ Size += getInstSizeInBytes(*I);
+ }
+ return Size;
+}
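+
+// Note: BUNDLE pseudos have no fixed encoding, so their size is the sum of
+// the bundled instructions; this keeps size-dependent passes such as branch
+// relaxation seeing the true code size.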
+
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
SmallVectorImpl<MachineOperand> &Cond) {
// Block ends with fall-through condbranch.
@@ -216,6 +238,12 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
+ // Skip over SpeculationBarrierEndBB terminators
+ if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
+ I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
+ --I;
+ }
+
if (!isUnpredicatedTerminator(*I))
return false;
@@ -496,8 +524,9 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles,
int &FalseCycles) const {
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -506,6 +535,12 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
if (!RC)
return false;
+ // Also need to check the dest regclass, in case we're trying to optimize
+ // something like:
+ // %1(gpr) = PHI %2(fpr), bb1, %3(fpr), bb2
+ if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
+ return false;
+
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
unsigned ExtraCondLat = Cond.size() != 1;
@@ -538,9 +573,9 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ Register TrueReg, Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Parse the condition code, see parseCondBranch() above.
@@ -910,7 +945,7 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
}
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default:
@@ -935,6 +970,7 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned WidthA = 0, WidthB = 0;
+ bool OffsetAIsScalable = false, OffsetBIsScalable = false;
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
@@ -948,9 +984,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
// base are identical, and the offset of a lower memory access +
// the width doesn't overlap the offset of a higher memory access,
// then the memory accesses are different.
- if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
- getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
- if (BaseOpA->isIdenticalTo(*BaseOpB)) {
+ // If OffsetAIsScalable and OffsetBIsScalable are both true, they
+ // are assumed to have the same scale (vscale).
+ if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
+ WidthA, TRI) &&
+ getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
+ WidthB, TRI)) {
+ if (BaseOpA->isIdenticalTo(*BaseOpB) &&
+ OffsetAIsScalable == OffsetBIsScalable) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
@@ -984,8 +1025,8 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
-bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
// The first operand can be a frame index where we'd normally expect a
// register.
@@ -1156,10 +1197,9 @@ static bool areCFlagsAccessedBetweenInstrs(
return MI.getIterator() == From;
}) != To->getParent()->rend());
- // We iterate backward starting \p To until we hit \p From.
- for (--To; To != From; --To) {
- const MachineInstr &Instr = *To;
-
+ // We iterate backward starting at \p To until we hit \p From.
+ for (const MachineInstr &Instr :
+ instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
if (((AccessToCheck & AK_Write) &&
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
@@ -1180,7 +1220,7 @@ static bool areCFlagsAccessedBetweenInstrs(
/// instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
assert(CmpInstr.getParent());
assert(MRI);
@@ -1416,10 +1456,9 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
return false;
UsedNZCV NZCVUsedAfterCmp;
- for (auto I = std::next(CmpInstr->getIterator()),
- E = CmpInstr->getParent()->instr_end();
- I != E; ++I) {
- const MachineInstr &Instr = *I;
+ for (const MachineInstr &Instr :
+ instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
+ CmpInstr->getParent()->instr_end())) {
if (Instr.readsRegister(AArch64::NZCV, TRI)) {
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
@@ -1684,6 +1723,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
@@ -1796,9 +1837,37 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
+ case AArch64::LD1B_IMM:
+ case AArch64::LD1H_IMM:
+ case AArch64::LD1W_IMM:
+ case AArch64::LD1D_IMM:
+ case AArch64::ST1B_IMM:
+ case AArch64::ST1H_IMM:
+ case AArch64::ST1W_IMM:
+ case AArch64::ST1D_IMM:
+ case AArch64::LD1B_H_IMM:
+ case AArch64::LD1SB_H_IMM:
+ case AArch64::LD1H_S_IMM:
+ case AArch64::LD1SH_S_IMM:
+ case AArch64::LD1W_D_IMM:
+ case AArch64::LD1SW_D_IMM:
+ case AArch64::ST1B_H_IMM:
+ case AArch64::ST1H_S_IMM:
+ case AArch64::ST1W_D_IMM:
+ case AArch64::LD1B_S_IMM:
+ case AArch64::LD1SB_S_IMM:
+ case AArch64::LD1H_D_IMM:
+ case AArch64::LD1SH_D_IMM:
+ case AArch64::ST1B_S_IMM:
+ case AArch64::ST1H_D_IMM:
+ case AArch64::LD1B_D_IMM:
+ case AArch64::LD1SB_D_IMM:
+ case AArch64::ST1B_D_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
return 2;
}
}
@@ -1978,20 +2047,25 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
return true;
}
-bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
- unsigned Width;
- return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
+ const MachineOperand *BaseOp;
+ if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
+ Width, TRI))
+ return false;
+ BaseOps.push_back(BaseOp);
+ return true;
}
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
- unsigned &Width, const TargetRegisterInfo *TRI) const {
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
@@ -2010,7 +2084,7 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
// Get the scaling factor for the instruction and set the width for the
// instruction.
- unsigned Scale = 0;
+ TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
// If this returns false, then it's an instruction we don't want to handle.
@@ -2022,12 +2096,13 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
// set to 1.
if (LdSt.getNumExplicitOperands() == 3) {
BaseOp = &LdSt.getOperand(1);
- Offset = LdSt.getOperand(2).getImm() * Scale;
+ Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
} else {
assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
BaseOp = &LdSt.getOperand(2);
- Offset = LdSt.getOperand(3).getImm() * Scale;
+ Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
}
+ OffsetIsScalable = Scale.isScalable();
if (!BaseOp->isReg() && !BaseOp->isFI())
return false;
@@ -2043,26 +2118,28 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
return OfsOp;
}
-bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
unsigned &Width, int64_t &MinOffset,
int64_t &MaxOffset) {
+ const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
switch (Opcode) {
// Not a memory operation or something we want to handle.
default:
- Scale = Width = 0;
+ Scale = TypeSize::Fixed(0);
+ Width = 0;
MinOffset = MaxOffset = 0;
return false;
case AArch64::STRWpost:
case AArch64::LDRWpost:
Width = 32;
- Scale = 4;
+ Scale = TypeSize::Fixed(4);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2072,7 +2149,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURXi:
case AArch64::STURDi:
Width = 8;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2082,7 +2159,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURWi:
case AArch64::STURSi:
Width = 4;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2093,7 +2170,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURHi:
case AArch64::STURHHi:
Width = 2;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2104,7 +2181,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURBi:
case AArch64::STURBBi:
Width = 1;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2112,14 +2189,15 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDNPQi:
case AArch64::STPQi:
case AArch64::STNPQi:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2131,7 +2209,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
- Scale = 8;
+ Scale = TypeSize::Fixed(8);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
@@ -2141,7 +2219,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
- Scale = Width = 8;
+ Scale = TypeSize::Fixed(8);
+ Width = 8;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2153,7 +2232,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
- Scale = 4;
+ Scale = TypeSize::Fixed(4);
Width = 8;
MinOffset = -64;
MaxOffset = 63;
@@ -2163,7 +2242,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
- Scale = Width = 4;
+ Scale = TypeSize::Fixed(4);
+ Width = 4;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2173,7 +2253,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
- Scale = Width = 2;
+ Scale = TypeSize::Fixed(2);
+ Width = 2;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2183,18 +2264,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
- Scale = Width = 1;
+ Scale = TypeSize::Fixed(1);
+ Width = 1;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::ADDG:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 0;
MinOffset = 0;
MaxOffset = 63;
break;
case AArch64::TAGPstack:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 0;
// TAGP with a negative offset turns into SUBP, which has a maximum offset
// of 63 (not 64!).
@@ -2204,31 +2286,110 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::STR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 4;
+ MinOffset = -256;
+ MaxOffset = 252;
+ break;
+ case AArch64::STR_ZZZXI:
+ case AArch64::LDR_ZZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 3;
+ MinOffset = -256;
+ MaxOffset = 253;
+ break;
+ case AArch64::STR_ZZXI:
+ case AArch64::LDR_ZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 2;
+ MinOffset = -256;
+ MaxOffset = 254;
+ break;
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
- Scale = Width = 2;
+ Scale = TypeSize::Scalable(2);
+ Width = SVEMaxBytesPerVector / 8;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
- Scale = Width = 16;
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector;
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::LD1B_IMM:
+ case AArch64::LD1H_IMM:
+ case AArch64::LD1W_IMM:
+ case AArch64::LD1D_IMM:
+ case AArch64::ST1B_IMM:
+ case AArch64::ST1H_IMM:
+ case AArch64::ST1W_IMM:
+ case AArch64::ST1D_IMM:
+ // A full vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_H_IMM:
+ case AArch64::LD1SB_H_IMM:
+ case AArch64::LD1H_S_IMM:
+ case AArch64::LD1SH_S_IMM:
+ case AArch64::LD1W_D_IMM:
+ case AArch64::LD1SW_D_IMM:
+ case AArch64::ST1B_H_IMM:
+ case AArch64::ST1H_S_IMM:
+ case AArch64::ST1W_D_IMM:
+ // A half vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(8);
+ Width = SVEMaxBytesPerVector / 2;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_S_IMM:
+ case AArch64::LD1SB_S_IMM:
+ case AArch64::LD1H_D_IMM:
+ case AArch64::LD1SH_D_IMM:
+ case AArch64::ST1B_S_IMM:
+ case AArch64::ST1H_D_IMM:
+ // A quarter vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(4);
+ Width = SVEMaxBytesPerVector / 4;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_D_IMM:
+ case AArch64::LD1SB_D_IMM:
+ case AArch64::ST1B_D_IMM:
+ // An eighth vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(2);
+ Width = SVEMaxBytesPerVector / 8;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
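+ // Note (assumed rationale): for the scalable cases above, the byte offset
+ // is imm * Scale * vscale, and Width is a conservative upper bound derived
+ // from SVEMaxBitsPerVector.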
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::STGPi:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
@@ -2363,9 +2524,13 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
-bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const {
+bool AArch64InstrInfo::shouldClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+ unsigned NumBytes) const {
+ assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+ const MachineOperand &BaseOp1 = *BaseOps1.front();
+ const MachineOperand &BaseOp2 = *BaseOps2.front();
const MachineInstr &FirstLdSt = *BaseOp1.getParent();
const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (BaseOp1.getType() != BaseOp2.getType())
@@ -2379,7 +2544,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
return false;
// Only cluster up to a single pair.
- if (NumLoads > 1)
+ if (NumLoads > 2)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
@@ -2822,11 +2987,11 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
- unsigned SrcReg, bool IsKill,
+ Register SrcReg, bool IsKill,
unsigned SubIdx0, unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
- unsigned SrcReg0 = SrcReg;
- unsigned SrcReg1 = SrcReg;
+ Register SrcReg0 = SrcReg;
+ Register SrcReg1 = SrcReg;
if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
@@ -2842,18 +3007,19 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
}
void AArch64InstrInfo::storeRegToStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
+ unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -2862,6 +3028,11 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
+ else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_PXI;
+ StackID = TargetStackID::SVEVector;
+ }
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -2901,6 +3072,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
get(AArch64::STPXi), SrcReg, isKill,
AArch64::sube64, AArch64::subo64, FI, MMO);
return;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 24:
@@ -2919,6 +3094,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 48:
@@ -2926,6 +3105,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d;
Offset = false;
+ } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 64:
@@ -2933,19 +3116,13 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
}
- unsigned StackID = TargetStackID::Default;
- if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
- Opc = AArch64::STR_PXI;
- StackID = TargetStackID::SVEVector;
- } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
- Opc = AArch64::STR_ZXI;
- StackID = TargetStackID::SVEVector;
- }
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
@@ -2962,11 +3139,11 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
- unsigned DestReg, unsigned SubIdx0,
+ Register DestReg, unsigned SubIdx0,
unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
- unsigned DestReg0 = DestReg;
- unsigned DestReg1 = DestReg;
+ Register DestReg0 = DestReg;
+ Register DestReg1 = DestReg;
bool IsUndef = true;
if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
@@ -2984,18 +3161,19 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
}
void AArch64InstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
+ unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -3004,6 +3182,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
+ else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_PXI;
+ StackID = TargetStackID::SVEVector;
+ }
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -3043,6 +3226,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
get(AArch64::LDPXi), DestReg, AArch64::sube64,
AArch64::subo64, FI, MMO);
return;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 24:
@@ -3061,6 +3248,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 48:
@@ -3068,6 +3259,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d;
Offset = false;
+ } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 64:
@@ -3075,20 +3270,14 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
}
- unsigned StackID = TargetStackID::Default;
- if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
- Opc = AArch64::LDR_PXI;
- StackID = TargetStackID::SVEVector;
- } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
- Opc = AArch64::LDR_ZXI;
- StackID = TargetStackID::SVEVector;
- }
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
@@ -3100,6 +3289,17 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MI.addMemOperand(MMO);
}
+bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
+ const MachineInstr &UseMI,
+ const TargetRegisterInfo *TRI) {
+ return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
+ UseMI.getIterator()),
+ [TRI](const MachineInstr &I) {
+ return I.modifiesRegister(AArch64::NZCV, TRI) ||
+ I.readsRegister(AArch64::NZCV, TRI);
+ });
+}
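+
+// Illustrative use (assumed): callers can guard flag-reuse optimizations,
+// e.g.
+//   if (!isNZCVTouchedInInstructionRange(DefMI, UseMI, TRI))
+//     ; // safe to rely at UseMI on the NZCV flags set by DefMI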
+
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
@@ -3146,6 +3346,10 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+ Register TmpReg = DestReg;
+ if (TmpReg == AArch64::XZR)
+ TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
do {
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
unsigned LocalShiftSize = 0;
@@ -3155,7 +3359,11 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
- auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+
+ Offset -= ThisVal << LocalShiftSize;
+ if (Offset == 0)
+ TmpReg = DestReg;
+ auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
.addReg(SrcReg)
.addImm(Sign * (int)ThisVal);
if (ShiftSize)
@@ -3176,8 +3384,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
.addImm(Imm)
.setMIFlag(Flag);
- assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
- "emit a single SEH directive");
+ assert(Offset == 0 && "Expected remaining offset to be zero to "
+ "emit a single SEH directive");
} else if (DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
@@ -3190,8 +3398,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
*HasWinCFI = true;
}
- SrcReg = DestReg;
- Offset -= ThisVal << LocalShiftSize;
+ SrcReg = TmpReg;
} while (Offset);
}
@@ -3414,18 +3621,6 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
return nullptr;
}
-static bool isSVEScaledImmInstruction(unsigned Opcode) {
- switch (Opcode) {
- case AArch64::LDR_ZXI:
- case AArch64::STR_ZXI:
- case AArch64::LDR_PXI:
- case AArch64::STR_PXI:
- return true;
- default:
- return false;
- }
-}
-
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
StackOffset &SOffset,
bool *OutUseUnscaledOp,
@@ -3458,20 +3653,23 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}
// Get the min/max offset and the scale.
- unsigned Scale, Width;
+ TypeSize ScaleValue(0U, false);
+ unsigned Width;
int64_t MinOff, MaxOff;
- if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
+ if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
- bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
- int64_t Offset =
- IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
+ bool IsMulVL = ScaleValue.isScalable();
+ unsigned Scale = ScaleValue.getKnownMinSize();
+ int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
@@ -3484,9 +3682,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
if (useUnscaledOp &&
- !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
+ !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
+ MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
+ Scale = ScaleValue.getKnownMinSize();
+ assert(IsMulVL == ScaleValue.isScalable() &&
+ "Unscaled opcode has different value for scalable");
+
int64_t Remainder = Offset % Scale;
assert(!(Remainder && useUnscaledOp) &&
"Cannot have remainder when using unscaled op");
@@ -5791,6 +5994,35 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
});
+ // Check whether CFI instructions are present; if they are, count how many
+ // appear in the candidate sequence.
+ unsigned CFICount = 0;
+ MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
+ for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
+ Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
+ const std::vector<MCCFIInstruction> &CFIInstructions =
+ RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
+ if (MBBI->isCFIInstruction()) {
+ unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ CFICount++;
+ }
+ MBBI++;
+ }
+
+ // Compare the number of CFI instructions found to the number of CFI
+ // instructions in the parent function for each candidate. We must check
+ // this because if we outline one CFI instruction in a function, we must
+ // outline them all for correctness; otherwise, the address offsets between
+ // the two sections of the program will be incorrect.
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ std::vector<MCCFIInstruction> CFIInstructions =
+ C.getMF()->getFrameInstructions();
+
+ if (CFICount > 0 && CFICount != CFIInstructions.size())
+ return outliner::OutlinedFunction();
+ }
+
// Returns true if an instructions is safe to fix up, false otherwise.
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
if (MI.isCall())
@@ -5811,23 +6043,29 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
if (MI.mayLoadOrStore()) {
const MachineOperand *Base; // Filled with the base operand of MI.
int64_t Offset; // Filled with the offset of MI.
+ bool OffsetIsScalable;
// Does it allow us to offset the base operand and is the base the
// register SP?
- if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
- Base->getReg() != AArch64::SP)
+ if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
+ !Base->isReg() || Base->getReg() != AArch64::SP)
+ return false;
+
+ // The fix-up code below assumes byte offsets.
+ if (OffsetIsScalable)
return false;
// Find the minimum/maximum offset for this instruction and check
// if fixing it up would be in range.
int64_t MinOffset,
MaxOffset; // Unscaled offsets for the instruction.
- unsigned Scale; // The scale to multiply the offsets by.
+ TypeSize Scale(0U, false); // The scale to multiply the offsets by.
unsigned DummyWidth;
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
Offset += 16; // Update the offset to what it would be if we outlined.
- if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
+ if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
+ Offset > MaxOffset * (int64_t)Scale.getFixedSize())
return false;
// It's in range, so we can outline it.
@@ -5854,7 +6092,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
}
else if (LastInstrOpcode == AArch64::BL ||
- (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
+ ((LastInstrOpcode == AArch64::BLR ||
+ LastInstrOpcode == AArch64::BLRNoIP) &&
+ !HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
@@ -5960,6 +6200,11 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
}
}
+ // If we have CFI instructions, we can only outline if the outlined section
+ // can be a tail call
+ if (FrameID != MachineOutlinerTailCall && CFICount > 0)
+ return outliner::OutlinedFunction();
+
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
}
@@ -5986,6 +6231,10 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
+ // FIXME: Teach the outliner to generate/handle Windows unwind info.
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
+ return false;
+
// It's safe to outline from MF.
return true;
}
@@ -6081,6 +6330,15 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
+ // We can only outline these if we will tail call the outlined function, or
+ // fix up the CFI offsets. Currently, CFI instructions are outlined only
+ // when the outlined section is a tail call.
+ //
+ // FIXME: If the proper fixups for the offset are implemented, this should be
+ // possible.
+ if (MI.isCFIInstruction())
+ return outliner::InstrType::Legal;
+
// Don't allow debug values to impact outlining type.
if (MI.isDebugInstr() || MI.isIndirectDebugValue())
return outliner::InstrType::Invisible;
@@ -6150,10 +6408,11 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
- // as a tail-call. Whitelist the call instructions we know about so we
+ // as a tail-call. Explicitly list the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
- if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
+ if (MI.getOpcode() == AArch64::BLR ||
+ MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
@@ -6205,26 +6464,29 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
const MachineOperand *Base;
unsigned Width;
int64_t Offset;
+ bool OffsetIsScalable;
// Is this a load or store with an immediate offset with SP as the base?
if (!MI.mayLoadOrStore() ||
- !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
+ !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
+ &RI) ||
(Base->isReg() && Base->getReg() != AArch64::SP))
continue;
// It is, so we have to fix it up.
- unsigned Scale;
+ TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
assert(Scale != 0 && "Unexpected opcode!");
+ assert(!OffsetIsScalable && "Expected offset to be a byte offset");
// We've pushed the return address to the stack, so add 16 to the offset.
// This is safe, since we already checked if it would overflow when we
// checked if this instruction was legal to outline.
- int64_t NewImm = (Offset + 16) / Scale;
+ int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
StackOffsetOperand.setImm(NewImm);
}
}
@@ -6285,15 +6547,21 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
- // For thunk outlining, rewrite the last instruction from a call to a
- // tail-call.
- if (OF.FrameConstructionID == MachineOutlinerThunk) {
+
+ AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
+
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
+ FI->setOutliningStyle("Tail Call");
+ else if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
MachineInstr *Call = &*--MBB.instr_end();
unsigned TailOpcode;
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
- assert(Call->getOpcode() == AArch64::BLR);
+ assert(Call->getOpcode() == AArch64::BLR ||
+ Call->getOpcode() == AArch64::BLRNoIP);
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
@@ -6301,6 +6569,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
.addImm(0);
MBB.insert(MBB.end(), TC);
Call->eraseFromParent();
+
+ FI->setOutliningStyle("Thunk");
}
bool IsLeafFunction = true;
@@ -6320,7 +6590,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
IsLeafFunction = false;
// LR has to be a live in so that we can save it.
- MBB.addLiveIn(AArch64::LR);
+ if (!MBB.isLiveIn(AArch64::LR))
+ MBB.addLiveIn(AArch64::LR);
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
@@ -6343,7 +6614,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
// Add a CFI saying the stack was moved 16 B down.
int64_t StackPosEntry =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(StackPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
@@ -6351,7 +6622,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
// Add a CFI saying that the LR that we want to find is now 16 B higher than
// before.
int64_t LRPosEntry =
- MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(LRPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
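+ // (Assumed rationale for the sign change: the CFA is 16 bytes above the
+ // new SP and LR is saved at the new SP, i.e. at CFA-16, so the DWARF
+ // offset must be negative.)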
@@ -6399,13 +6670,20 @@ void AArch64InstrInfo::buildOutlinedFrame(
}
// It's not a tail call, so we have to insert the return ourselves.
+
+ // LR has to be a live in so that we can return to it.
+ if (!MBB.isLiveIn(AArch64::LR))
+ MBB.addLiveIn(AArch64::LR);
+
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
- .addReg(AArch64::LR, RegState::Undef);
+ .addReg(AArch64::LR);
MBB.insert(MBB.end(), ret);
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
ShouldSignReturnAddrWithAKey);
+ FI->setOutliningStyle("Function");
+
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
return;
@@ -6519,7 +6797,8 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
- if (Reg != MI.getOperand(0).getReg())
+ const MachineOperand &Op0 = MI.getOperand(0);
+ if (!Op0.isReg() || Reg != Op0.getReg())
return None;
switch (MI.getOpcode()) {
@@ -6614,5 +6893,17 @@ AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
+uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
+ return get(Opc).TSFlags & AArch64::ElementSizeMask;
+}
+
+unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
+ if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
+ return AArch64::BLRNoIP;
+ else
+ return AArch64::BLR;
+}
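+
+// Illustrative call-site sketch (assumed): indirect calls are emitted
+// through this helper so SLS-hardened code gets the BLRNoIP pseudo, e.g.
+//   BuildMI(MBB, MBBI, DL, TII.get(getBLRCallOpcode(MF))).addReg(CalleeReg);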
+
#define GET_INSTRINFO_HELPERS
+#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 66e517e549035..298c04d81708d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/TypeSize.h"
#define GET_INSTRINFO_HEADER
#include "AArch64GenInstrInfo.inc"
@@ -51,8 +52,8 @@ public:
bool isAsCheapAsAMove(const MachineInstr &MI) const override;
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
+ bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
+ Register &DstReg, unsigned &SubIdx) const override;
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
@@ -112,14 +113,19 @@ public:
/// Hint that pairing the given load or store is unprofitable.
static void suppressLdStPair(MachineInstr &MI);
- bool getMemOperandWithOffset(const MachineInstr &MI,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
+ /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`.
+ /// This is true for some SVE instructions like ldr/str that have a
+ /// 'reg + imm' addressing mode where the immediate is an index to the
+ /// scalable vector located at 'reg + imm * vscale x #bytes'.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI,
const MachineOperand *&BaseOp,
- int64_t &Offset, unsigned &Width,
+ int64_t &Offset, bool &OffsetIsScalable,
+ unsigned &Width,
const TargetRegisterInfo *TRI) const;
/// Return the immediate offset of the base register in a load/store \p LdSt.
@@ -129,12 +135,12 @@ public:
/// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
///
/// For unscaled instructions, \p Scale is set to 1.
- static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width,
int64_t &MinOffset, int64_t &MaxOffset);
- bool shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const override;
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads, unsigned NumBytes) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
@@ -149,13 +155,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock::iterator MBBI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -191,11 +197,12 @@ public:
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int &, int &, int &) const override;
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
void getNoop(MCInst &NopInst) const override;
bool isSchedulingBoundary(const MachineInstr &MI,
@@ -205,13 +212,13 @@ public:
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction supplying the argument to
/// the comparison into one that sets the zero bit in the flags register.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
bool optimizeCondBranch(MachineInstr &MI) const override;
@@ -264,6 +271,8 @@ public:
MachineBasicBlock::iterator &It, MachineFunction &MF,
const outliner::Candidate &C) const override;
bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
+ /// Returns the vector element size (B, H, S or D) of an SVE opcode.
+ uint64_t getElementSizeForOpcode(unsigned Opc) const;
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
static bool isFalkorShiftExtFast(const MachineInstr &MI);
@@ -288,6 +297,8 @@ protected:
isCopyInstrImpl(const MachineInstr &MI) const override;
private:
+ unsigned getInstBundleLength(const MachineInstr &MI) const;
+
/// Sets the offsets on outlined instructions in \p MBB which use SP
/// so that they will be valid post-outlining.
///
@@ -305,6 +316,12 @@ private:
unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
};
+/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI
+/// which either reads or clobbers NZCV.
+bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
+ const MachineInstr &UseMI,
+ const TargetRegisterInfo *TRI);
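
A rough sketch of the scan this declaration implies, assuming \p DefMI and
\p UseMI share a basic block (the real definition lives in the .cpp and may
differ):

// Illustrative only; assumes DefMI and UseMI are in the same block.
static bool nzcvTouchedSketch(const MachineInstr &DefMI,
                              const MachineInstr &UseMI,
                              const TargetRegisterInfo *TRI) {
  for (auto It = std::next(DefMI.getIterator()); It != UseMI.getIterator(); ++It)
    if (It->readsRegister(AArch64::NZCV, TRI) ||
        It->modifiesRegister(AArch64::NZCV, TRI))
      return true;
  return false;
}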
+
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
/// plus Offset. This is intended to be used from within the prolog/epilog
/// insertion (PEI) pass, where a virtual scratch register may be allocated
@@ -369,12 +386,24 @@ static inline bool isCondBranchOpcode(int Opc) {
}
static inline bool isIndirectBranchOpcode(int Opc) {
- return Opc == AArch64::BR;
+ switch (Opc) {
+ case AArch64::BR:
+ case AArch64::BRAA:
+ case AArch64::BRAB:
+ case AArch64::BRAAZ:
+ case AArch64::BRABZ:
+ return true;
+ }
+ return false;
}
+/// Return the opcode to be used for indirect calls.
+unsigned getBLRCallOpcode(const MachineFunction &MF);
+
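The tail of the corresponding definition is visible at the top of this hunk
('return AArch64::BLR;'); a plausible sketch of the whole body, assuming it
keys off the hardenSlsBlr() subtarget hook used by the SLSBLRMitigation
predicate later in this patch:

// Sketch; the actual definition is in AArch64InstrInfo.cpp.
unsigned getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP; // avoids X16/X17 via the GPR64noip class
  return AArch64::BLR;
}
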
// struct TSFlags {
#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
-#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 1-bit
+#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bits
+#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits
// }
namespace AArch64 {
@@ -389,13 +418,31 @@ enum ElementSizeType {
};
enum DestructiveInstType {
- DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
- NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
- Destructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+ DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0xf),
+ NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
+ DestructiveOther = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+ DestructiveUnary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x2),
+ DestructiveBinaryImm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x3),
+ DestructiveBinaryShImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x4),
+ DestructiveBinary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x5),
+ DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6),
+ DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7),
+ DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8),
+};
+
+enum FalseLaneType {
+ FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3),
+ FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1),
+ FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2),
};
#undef TSFLAG_ELEMENT_SIZE_TYPE
#undef TSFLAG_DESTRUCTIVE_INST_TYPE
+#undef TSFLAG_FALSE_LANE_TYPE
+
+int getSVEPseudoMap(uint16_t Opcode);
+int getSVERevInstr(uint16_t Opcode);
+int getSVENonRevInstr(uint16_t Opcode);
}
} // end namespace llvm
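
Since the masks defined in the TSFlags block above already carry their shifts,
decoding a field is a single AND; a minimal sketch (assuming the names are
visible via namespace llvm, as in the header above):

// Illustrative decode helpers for the packed TSFlags fields; not part of
// the patch.
static inline AArch64::DestructiveInstType
getDestructiveType(uint64_t TSFlags) {
  return static_cast<AArch64::DestructiveInstType>(
      TSFlags & AArch64::DestructiveInstTypeMask);
}
static inline AArch64::FalseLaneType getFalseLaneType(uint64_t TSFlags) {
  return static_cast<AArch64::FalseLaneType>(TSFlags & AArch64::FalseLanesMask);
}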
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d590d4d913ff8..f4a5f639e4973 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -14,142 +14,154 @@
// ARM Instruction Predicate Definitions.
//
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+ AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+ AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+ AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+ AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+ AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
+ AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
def HasVH : Predicate<"Subtarget->hasVH()">,
- AssemblerPredicate<"FeatureVH", "vh">;
+ AssemblerPredicate<(all_of FeatureVH), "vh">;
def HasLOR : Predicate<"Subtarget->hasLOR()">,
- AssemblerPredicate<"FeatureLOR", "lor">;
+ AssemblerPredicate<(all_of FeatureLOR), "lor">;
def HasPA : Predicate<"Subtarget->hasPA()">,
- AssemblerPredicate<"FeaturePA", "pa">;
+ AssemblerPredicate<(all_of FeaturePA), "pa">;
def HasJS : Predicate<"Subtarget->hasJS()">,
- AssemblerPredicate<"FeatureJS", "jsconv">;
+ AssemblerPredicate<(all_of FeatureJS), "jsconv">;
def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">,
- AssemblerPredicate<"FeatureCCIDX", "ccidx">;
+ AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">;
def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
- AssemblerPredicate<"FeatureComplxNum", "complxnum">;
+ AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">;
def HasNV : Predicate<"Subtarget->hasNV()">,
- AssemblerPredicate<"FeatureNV", "nv">;
+ AssemblerPredicate<(all_of FeatureNV), "nv">;
def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">,
- AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">;
+ AssemblerPredicate<(all_of FeatureRASv8_4), "rasv8_4">;
def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
- AssemblerPredicate<"FeatureMPAM", "mpam">;
+ AssemblerPredicate<(all_of FeatureMPAM), "mpam">;
def HasDIT : Predicate<"Subtarget->hasDIT()">,
- AssemblerPredicate<"FeatureDIT", "dit">;
+ AssemblerPredicate<(all_of FeatureDIT), "dit">;
def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">,
- AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">;
+ AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">;
def HasAM : Predicate<"Subtarget->hasAM()">,
- AssemblerPredicate<"FeatureAM", "am">;
+ AssemblerPredicate<(all_of FeatureAM), "am">;
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
- AssemblerPredicate<"FeatureSEL2", "sel2">;
+ AssemblerPredicate<(all_of FeatureSEL2), "sel2">;
def HasPMU : Predicate<"Subtarget->hasPMU()">,
- AssemblerPredicate<"FeaturePMU", "pmu">;
+ AssemblerPredicate<(all_of FeaturePMU), "pmu">;
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
- AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
+ AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">;
def HasFMI : Predicate<"Subtarget->hasFMI()">,
- AssemblerPredicate<"FeatureFMI", "fmi">;
+ AssemblerPredicate<(all_of FeatureFMI), "fmi">;
def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
- AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">;
+ AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
- AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+ AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON", "neon">;
+ AssemblerPredicate<(all_of FeatureNEON), "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto", "crypto">;
+ AssemblerPredicate<(all_of FeatureCrypto), "crypto">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
- AssemblerPredicate<"FeatureSM4", "sm4">;
+ AssemblerPredicate<(all_of FeatureSM4), "sm4">;
def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
- AssemblerPredicate<"FeatureSHA3", "sha3">;
+ AssemblerPredicate<(all_of FeatureSHA3), "sha3">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<"FeatureSHA2", "sha2">;
+ AssemblerPredicate<(all_of FeatureSHA2), "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<"FeatureAES", "aes">;
+ AssemblerPredicate<(all_of FeatureAES), "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<"FeatureDotProd", "dotprod">;
+ AssemblerPredicate<(all_of FeatureDotProd), "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC", "crc">;
+ AssemblerPredicate<(all_of FeatureCRC), "crc">;
def HasLSE : Predicate<"Subtarget->hasLSE()">,
- AssemblerPredicate<"FeatureLSE", "lse">;
+ AssemblerPredicate<(all_of FeatureLSE), "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<"FeatureRAS", "ras">;
+ AssemblerPredicate<(all_of FeatureRAS), "ras">;
def HasRDM : Predicate<"Subtarget->hasRDM()">,
- AssemblerPredicate<"FeatureRDM", "rdm">;
+ AssemblerPredicate<(all_of FeatureRDM), "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+ AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<"FeatureFP16FML", "fp16fml">;
+ AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
- AssemblerPredicate<"FeatureSPE", "spe">;
+ AssemblerPredicate<(all_of FeatureSPE), "spe">;
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
- AssemblerPredicate<"FeatureFuseAES",
+ AssemblerPredicate<(all_of FeatureFuseAES),
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->hasSVE()">,
- AssemblerPredicate<"FeatureSVE", "sve">;
+ AssemblerPredicate<(all_of FeatureSVE), "sve">;
def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
- AssemblerPredicate<"FeatureSVE2", "sve2">;
+ AssemblerPredicate<(all_of FeatureSVE2), "sve2">;
def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
- AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">;
+ AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">;
def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
- AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">;
+ AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">;
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
- AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
+ AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
+ AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
- AssemblerPredicate<"FeatureRCPC", "rcpc">;
+ AssemblerPredicate<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
- AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">;
+ AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">;
def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">,
- AssemblerPredicate<"FeatureFRInt3264", "frint3264">;
+ AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">;
def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<"FeatureSB", "sb">;
+ AssemblerPredicate<(all_of FeatureSB), "sb">;
def HasPredRes : Predicate<"Subtarget->hasPredRes()">,
- AssemblerPredicate<"FeaturePredRes", "predres">;
+ AssemblerPredicate<(all_of FeaturePredRes), "predres">;
def HasCCDP : Predicate<"Subtarget->hasCCDP()">,
- AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">;
+ AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">;
def HasBTI : Predicate<"Subtarget->hasBTI()">,
- AssemblerPredicate<"FeatureBranchTargetId", "bti">;
+ AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
- AssemblerPredicate<"FeatureMTE", "mte">;
+ AssemblerPredicate<(all_of FeatureMTE), "mte">;
def HasTME : Predicate<"Subtarget->hasTME()">,
- AssemblerPredicate<"FeatureTME", "tme">;
+ AssemblerPredicate<(all_of FeatureTME), "tme">;
def HasETE : Predicate<"Subtarget->hasETE()">,
- AssemblerPredicate<"FeatureETE", "ete">;
+ AssemblerPredicate<(all_of FeatureETE), "ete">;
def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
- AssemblerPredicate<"FeatureTRBE", "trbe">;
+ AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">,
+ AssemblerPredicate<(all_of FeatureBF16), "bf16">;
+def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
+ AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
+def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
+ AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
+def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
+ AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def UseExperimentalZeroingPseudos
+ : Predicate<"Subtarget->useExperimentalZeroingPseudos()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
def UseNegativeImmediates
- : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
"NegativeImmediates">;
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
@@ -227,6 +239,10 @@ def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+
def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
@@ -245,6 +261,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -419,7 +436,14 @@ def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
-def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64strict_fcmp : SDNode<"AArch64ISD::STRICT_FCMP", SDT_AArch64FCmp,
+ [SDNPHasChain]>;
+def AArch64strict_fcmpe : SDNode<"AArch64ISD::STRICT_FCMPE", SDT_AArch64FCmp,
+ [SDNPHasChain]>;
+def AArch64any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
+ [(AArch64strict_fcmp node:$lhs, node:$rhs),
+ (AArch64fcmp node:$lhs, node:$rhs)]>;
def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
@@ -457,10 +481,12 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>;
+def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>;
def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
-def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
@@ -528,6 +554,9 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
+def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
+
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -544,6 +573,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -564,6 +594,8 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+ def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+ def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
// Toggles patterns which aren't beneficial in GlobalISel when we aren't
// optimizing. This allows us to selectively use patterns without impacting
// SelectionDAG's behaviour.
@@ -686,6 +718,14 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in {
: Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
}
+// SpeculationBarrierEndBB must only be used after an unconditional control-flow
+// transfer, i.e. after a terminator for which isBarrier is true (a placement
+// check is sketched after this block).
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ def SpeculationBarrierISBDSBEndBB
+ : Pseudo<(outs), (ins), []>, Sched<[]>;
+ def SpeculationBarrierSBEndBB
+ : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
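
A hypothetical check for the placement constraint stated above (illustrative
only, not part of the patch):

// The barrier pseudo must directly follow a terminator with isBarrier set.
static bool isValidSpeculationBarrierPlacement(
    MachineBasicBlock::const_iterator It, const MachineBasicBlock &MBB) {
  if (It == MBB.begin())
    return false;
  return std::prev(It)->isBarrier();
}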
//===----------------------------------------------------------------------===//
// System instructions.
@@ -698,8 +738,15 @@ def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"dgh", (HINT 0b110)>;
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
def : InstAlias<"csdb", (HINT 20)>;
+// In order to be able to write readable assembly, LLVM should accept assembly
+// inputs that use Branch Target Identification mnemonics, even with BTI disabled.
+// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
+// should not emit these mnemonics unless BTI is enabled.
+def : InstAlias<"bti", (HINT 32), 0>;
+def : InstAlias<"bti $op", (HINT btihint_op:$op), 0>;
def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>;
def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>;
@@ -731,10 +778,58 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
// ARMv8.2-A Dot Product
let Predicates = [HasDotProd] in {
-defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
-defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
+defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
+}
+
+// ARMv8.6-A BFloat
+let Predicates = [HasBF16] in {
+defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;
+defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
+def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
+def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
+def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
+def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
+def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
+def BFCVTN : SIMD_BFCVTN;
+def BFCVTN2 : SIMD_BFCVTN2;
+def BFCVT : BF16ToSinglePrecision<"bfcvt">;
+}
+
+// ARMv8.6A AArch64 matrix multiplication
+let Predicates = [HasMatMulInt8] in {
+def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
+def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
+def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
+defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
+defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
+
+// The sudot lane form is matched with a usdot pattern, since there is no
+// separate sudot operation. The second operand feeds the dup that repeats
+// the indexed element (a scalar model of this identity follows the block
+// below).
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
+ string rhs_kind, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType>
+ : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
+ lhs_kind, rhs_kind, RegType, AccumType,
+ InputType, null_frag> {
+ let Pattern = [(set (AccumType RegType:$dst),
+ (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))),
+ (InputType RegType:$Rn))))];
+}
+
+multiclass SIMDSUDOTIndex {
+ def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
+ def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
+}
+
+defm SUDOTlane : SIMDSUDOTIndex;
+
}
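
A scalar model of the identity the sudot pattern relies on, assuming the
4-element groups of the .4b forms (illustrative C++, not part of the patch):
a mixed-sign dot product yields the same value regardless of which operand
carries the sign, so duplicating the indexed element and swapping the operands
lets usdot implement sudot.

#include <cstdint>

// s carries the signed bytes, u the unsigned ones; s[i]*u[i] == u[i]*s[i]
// once both are widened, which is why usdot with swapped operands works.
int32_t dot4_su(int32_t acc, const int8_t s[4], const uint8_t u[4]) {
  for (int i = 0; i < 4; ++i)
    acc += int32_t(s[i]) * int32_t(u[i]);
  return acc;
}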
// ARMv8.2-A FP16 Fused Multiply-Add Long
@@ -819,38 +914,56 @@ let Predicates = [HasComplxNum, HasNEON] in {
// important for compatibility with other assemblers (e.g. GAS) when building
// software compatible with both CPUs that do or don't implement PA.
let Uses = [LR], Defs = [LR] in {
- def PACIAZ : SystemNoOperands<0b000, "hint #24">;
- def PACIBZ : SystemNoOperands<0b010, "hint #26">;
+ def PACIAZ : SystemNoOperands<0b000, "hint\t#24">;
+ def PACIBZ : SystemNoOperands<0b010, "hint\t#26">;
let isAuthenticated = 1 in {
- def AUTIAZ : SystemNoOperands<0b100, "hint #28">;
- def AUTIBZ : SystemNoOperands<0b110, "hint #30">;
+ def AUTIAZ : SystemNoOperands<0b100, "hint\t#28">;
+ def AUTIBZ : SystemNoOperands<0b110, "hint\t#30">;
}
}
let Uses = [LR, SP], Defs = [LR] in {
- def PACIASP : SystemNoOperands<0b001, "hint #25">;
- def PACIBSP : SystemNoOperands<0b011, "hint #27">;
+ def PACIASP : SystemNoOperands<0b001, "hint\t#25">;
+ def PACIBSP : SystemNoOperands<0b011, "hint\t#27">;
let isAuthenticated = 1 in {
- def AUTIASP : SystemNoOperands<0b101, "hint #29">;
- def AUTIBSP : SystemNoOperands<0b111, "hint #31">;
+ def AUTIASP : SystemNoOperands<0b101, "hint\t#29">;
+ def AUTIBSP : SystemNoOperands<0b111, "hint\t#31">;
}
}
let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
- def PACIA1716 : SystemNoOperands<0b000, "hint #8">;
- def PACIB1716 : SystemNoOperands<0b010, "hint #10">;
+ def PACIA1716 : SystemNoOperands<0b000, "hint\t#8">;
+ def PACIB1716 : SystemNoOperands<0b010, "hint\t#10">;
let isAuthenticated = 1 in {
- def AUTIA1716 : SystemNoOperands<0b100, "hint #12">;
- def AUTIB1716 : SystemNoOperands<0b110, "hint #14">;
+ def AUTIA1716 : SystemNoOperands<0b100, "hint\t#12">;
+ def AUTIB1716 : SystemNoOperands<0b110, "hint\t#14">;
}
}
let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
- def XPACLRI : SystemNoOperands<0b111, "hint #7">;
-}
+ def XPACLRI : SystemNoOperands<0b111, "hint\t#7">;
+}
+
+// In order to be able to write readable assembly, LLVM should accept assembly
+// inputs that use pointer authentication mnemonics, even with PA disabled.
+// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
+// should not emit these mnemonics unless PA is enabled.
+def : InstAlias<"paciaz", (PACIAZ), 0>;
+def : InstAlias<"pacibz", (PACIBZ), 0>;
+def : InstAlias<"autiaz", (AUTIAZ), 0>;
+def : InstAlias<"autibz", (AUTIBZ), 0>;
+def : InstAlias<"paciasp", (PACIASP), 0>;
+def : InstAlias<"pacibsp", (PACIBSP), 0>;
+def : InstAlias<"autiasp", (AUTIASP), 0>;
+def : InstAlias<"autibsp", (AUTIBSP), 0>;
+def : InstAlias<"pacia1716", (PACIA1716), 0>;
+def : InstAlias<"pacib1716", (PACIB1716), 0>;
+def : InstAlias<"autia1716", (AUTIA1716), 0>;
+def : InstAlias<"autib1716", (AUTIB1716), 0>;
+def : InstAlias<"xpaclri", (XPACLRI), 0>;
// These pointer authentication instructions require armv8.3a
let Predicates = [HasPA] in {
- // When compiling with PA, there is a better mnemonic for these instructions.
+ // When PA is enabled, a better mnemonic should be emitted.
def : InstAlias<"paciaz", (PACIAZ), 1>;
def : InstAlias<"pacibz", (PACIBZ), 1>;
def : InstAlias<"autiaz", (AUTIAZ), 1>;
@@ -884,15 +997,23 @@ let Predicates = [HasPA] in {
def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
// Combined Instructions
- def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
- def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
- def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
- def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
+ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
+ def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
+ }
+ let isCall = 1, Defs = [LR], Uses = [SP] in {
+ def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
+ def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
+ }
- def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
- def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
- def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
- def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
+ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
+ def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
+ }
+ let isCall = 1, Defs = [LR], Uses = [SP] in {
+ def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
+ def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
+ }
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RETAA : AuthReturn<0b010, 0, "retaa">;
@@ -1538,17 +1659,29 @@ def TAGPstack
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
-// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
-// $Rn_wback is one past the end of the range.
+// Large STG to be expanded into a loop. $sz is the size, $Rn is the start
+// address. $Rn_wback is one past the end of the range. $Rm is the loop
+// counter (an illustrative model follows the block below).
let isCodeGenOnly=1, mayStore=1 in {
+def STGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+def STZGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+// Variants of the above in which $Rn2 is an independent register not tied to
+// the input register $Rn. They exist so that a FrameIndex operand can be used
+// as $Rn (a FrameIndex cannot be written back).
def STGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
}
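
An illustrative model of the loop these pseudos expand to, assuming the
16-byte MTE tag granule (the helper name is hypothetical; the real expansion
is produced by the pseudo-expansion pass and may differ):

#include <cstdint>

// Rm counts down, Rn walks the range; on exit Rn equals $Rn_wback.
void stgLoopModel(uint64_t &Rm, uint64_t &Rn, uint64_t Sz) {
  for (Rm = Sz; Rm != 0; Rm -= 16) {
    // settag(Rn);  // one STG/STZG granule at address Rn
    Rn += 16;
  }
}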
@@ -1894,9 +2027,19 @@ def ERET : SpecialReturn<0b0100, "eret">;
def : InstAlias<"ret", (RET LR)>;
let isCall = 1, Defs = [LR], Uses = [SP] in {
-def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+ def BLR : BranchReg<0b0001, "blr", []>;
+ def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>,
+ Sched<[WriteBrReg]>,
+ PseudoInstExpansion<(BLR GPR64:$Rn)>;
} // isCall
+def : Pat<(AArch64call GPR64:$Rn),
+ (BLR GPR64:$Rn)>,
+ Requires<[NoSLSBLRMitigation]>;
+def : Pat<(AArch64call GPR64noip:$Rn),
+ (BLRNoIP GPR64noip:$Rn)>,
+ Requires<[SLSBLRMitigation]>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch
@@ -2129,6 +2272,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4bf16, LDRDroW, LDRDroX>;
}
defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
@@ -2143,6 +2287,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8bf16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
}
} // AddedComplexity = 10
@@ -2225,6 +2370,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
[(set (f128 FPR128Op:$Rt),
(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+// bf16 load pattern
+def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+
// For regular load, we do not have any alignment requirement.
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
@@ -2274,6 +2423,8 @@ let Predicates = [IsLE] in {
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -2297,6 +2448,8 @@ let Predicates = [IsLE] in {
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -2381,11 +2534,11 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) {
const DataLayout &DL = MF->getDataLayout();
- MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL);
- return Align && *Align >= 4 && G->getOffset() % 4 == 0;
+ Align Align = G->getGlobal()->getPointerAlignment(DL);
+ return Align >= 4 && G->getOffset() % 4 == 0;
}
if (auto *C = dyn_cast<ConstantPoolSDNode>(N))
- return C->getAlignment() >= 4 && C->getOffset() % 4 == 0;
+ return C->getAlign() >= 4 && C->getOffset() % 4 == 0;
return false;
}]>;
@@ -2425,7 +2578,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
[(set FPR8Op:$Rt,
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
- [(set FPR16Op:$Rt,
+ [(set (f16 FPR16Op:$Rt),
(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
[(set (f32 FPR32Op:$Rt),
@@ -2722,6 +2875,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
+ (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
+
+
//---
// (Register offset)
@@ -2791,6 +2948,7 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4bf16, FPR64, STRDroW, STRDroX>;
}
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
@@ -2806,6 +2964,7 @@ let Predicates = [IsLE, UseSTRQro] in {
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8bf16, FPR128, STRQroW, STRQroX>;
}
} // AddedComplexity = 10
@@ -2866,6 +3025,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
+// bf16 store pattern
+def : Pat<(store (bf16 FPR16Op:$Rt),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>;
+
let AddedComplexity = 10 in {
// Match all store 64 bits width whose type is compatible with FPR64
@@ -2893,6 +3057,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v4f16 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4bf16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
// Match all store 128 bits width whose type is compatible with FPR128
@@ -2923,6 +3090,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v8f16 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8bf16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
// truncstore i64
@@ -3030,6 +3200,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v4f16 FPR64:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4bf16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
// Match all store 128 bits width whose type is compatible with FPR128
@@ -3062,6 +3235,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v8f16 FPR128:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8bf16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
} // AddedComplexity = 10
@@ -3300,10 +3476,10 @@ defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns
defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
-defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
@@ -3375,8 +3551,8 @@ def : Pat<(i64 (llround f64:$Rn)),
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
-defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>;
//===----------------------------------------------------------------------===//
// Unscaled integer to floating point conversion instruction.
@@ -3541,8 +3717,8 @@ def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
// Floating point comparison instructions.
//===----------------------------------------------------------------------===//
-defm FCMPE : FPComparison<1, "fcmpe">;
-defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
+defm FCMPE : FPComparison<1, "fcmpe", AArch64strict_fcmpe>;
+defm FCMP : FPComparison<0, "fcmp", AArch64any_fcmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional comparison instructions.
@@ -3603,10 +3779,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
Sched<[]>;
}
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
- usesCustomInserter = 1 in
-def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>;
-
//===----------------------------------------------------------------------===//
// Floating point immediate move.
//===----------------------------------------------------------------------===//
@@ -3788,12 +3960,16 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
-def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
-def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
-def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
-def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
-def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
-def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
// Patterns for vector long shift (by element width). These need to match all
// three of zext, sext and anyext so it's easier to pull the patterns out of the
@@ -3900,7 +4076,7 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
-defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
@@ -3917,7 +4093,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
@@ -3934,33 +4110,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
- TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+// Pseudo bitwise select pattern BSP. It is expanded into BSL/BIT/BIF after
+// register allocation (a scalar model of the select follows the patterns
+// below).
+defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS),
+ (and (vnot node:$LHS), node:$RHS))>>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">;
+
+def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
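
A scalar model of the select that BSP (and BSL/BIT/BIF, which differ only in
which operand is tied) computes, matching the TriOpFrag above:

#include <cstdint>

// Each result bit is taken from n where the mask bit is set, from m where
// it is clear: (mask & n) | (~mask & m).
uint64_t bitwiseSelect(uint64_t mask, uint64_t n, uint64_t m) {
  return (mask & n) | (~mask & m);
}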
def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
@@ -4669,6 +4848,7 @@ multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
defm : ExtPat<v8i8, v16i8, 8>;
defm : ExtPat<v4i16, v8i16, 4>;
defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v4bf16, v8bf16, 4>;
defm : ExtPat<v2i32, v4i32, 2>;
defm : ExtPat<v2f32, v4f32, 2>;
defm : ExtPat<v1i64, v2i64, 1>;
@@ -4790,16 +4970,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
(v4f16 (DUPv4i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
+def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))),
+ (v4bf16 (DUPv4i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
(v8f16 (DUPv8i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
+def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))),
+ (v8bf16 (DUPv8i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
@@ -4915,6 +5108,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
@@ -4931,6 +5129,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
@@ -4956,6 +5159,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
(i64 0))>;
+def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn),
+ (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16lane
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0)),
+ dsub)>;
+
+def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn),
+ (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+ (INSvi16lane
+ V128:$Rn, VectorIndexH:$imm,
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0))>;
+
def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
@@ -5037,6 +5257,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
}
defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
@@ -5050,6 +5271,9 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
+ (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
(f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
@@ -5057,6 +5281,8 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
(f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
(f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
+ (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which had just as well be
@@ -5072,6 +5298,7 @@ def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
def : ConcatPat<v8f16, v4f16>;
+def : ConcatPat<v8bf16, v4bf16>;
def : ConcatPat<v16i8, v8i8>;
// If the high lanes are undef, though, we can just ignore them:
@@ -5613,6 +5840,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+ int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+ int_aarch64_neon_sqrdmulh_laneq>;
+
// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
@@ -5780,8 +6012,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
-defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
-def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>;
+def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftL64:$imm))),
(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
@@ -5794,8 +6026,8 @@ defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
int_aarch64_neon_sqshrn>;
defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
int_aarch64_neon_sqshrun>;
-defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
-def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>;
+def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
(SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
@@ -6147,6 +6379,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv4h GPR64sp:$Rn)>;
def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -6161,6 +6397,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
+def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>;
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -6176,6 +6413,7 @@ def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
+def : Ld1Lane64Pat<load, VectorIndexH, v4bf16, bf16, LD1i16>;
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -6204,6 +6442,7 @@ def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
+def : St1Lane128Pat<store, VectorIndexH, v8bf16, bf16, ST1i16>;
let AddedComplexity = 19 in
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
@@ -6219,6 +6458,7 @@ def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
+def : St1Lane64Pat<store, VectorIndexH, v4bf16, bf16, ST1i16>;
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -6244,6 +6484,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4bf16, bf16, ST1i16_POST, 2>;
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -6268,6 +6509,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8bf16, bf16, ST1i16_POST, 2>;
let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
@@ -6508,6 +6750,7 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6515,12 +6758,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6528,6 +6773,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6544,6 +6790,7 @@ def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -6552,6 +6799,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6560,6 +6808,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6568,6 +6817,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6579,6 +6829,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6587,6 +6838,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
let Predicates = [IsLE] in {
@@ -6594,6 +6846,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
@@ -6604,6 +6857,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -6618,6 +6873,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)),
+ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
@@ -6629,6 +6886,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+ (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
}
@@ -6658,6 +6917,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
@@ -6669,6 +6929,8 @@ def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
(v1i64 (REV64v8i8 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
(v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))),
+ (v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
(v1i64 (REV64v2i32 FPR64:$src))>;
}
@@ -6682,6 +6944,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -6696,6 +6959,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
(v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))),
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -6722,6 +6987,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
@@ -6730,6 +6996,13 @@ def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
@@ -6744,8 +7017,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
(v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))),
+ (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))),
+ (v4bf16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))),
+ (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -6755,6 +7042,7 @@ def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
@@ -6771,6 +7059,8 @@ def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
(v8i8 (REV64v8i8 FPR64:$src))>;
def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
(v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))),
+ (v8i8 (REV16v8i8 FPR64:$src))>;
}
let Predicates = [IsLE] in {
@@ -6779,6 +7069,7 @@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
@@ -6791,6 +7082,8 @@ def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
(f64 (REV64v8i8 FPR64:$src))>;
def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
(f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))),
+ (f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
@@ -6801,6 +7094,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -6813,6 +7107,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
(v1f64 (REV64v2i32 FPR64:$src))>;
def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
(v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))),
+ (v1f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -6824,6 +7120,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -6838,6 +7135,8 @@ def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
(v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))),
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -6848,6 +7147,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
}
let Predicates = [IsBE] in {
@@ -6862,6 +7162,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src), (i32 8)))>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -6877,6 +7180,7 @@ def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
}
@@ -6890,6 +7194,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
(v2f64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -6901,6 +7207,7 @@ let Predicates = [IsLE] in {
def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6913,6 +7220,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
(v4f32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -6929,6 +7238,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
@@ -6944,6 +7254,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
(v2i64 (REV64v4i32 FPR128:$src))>;
def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
(v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
}
def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -6954,6 +7266,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
@@ -6970,6 +7283,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
(v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
@@ -6998,6 +7313,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
@@ -7006,6 +7322,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
@@ -7022,8 +7345,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))),
+ (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))),
+ (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))),
+ (v8bf16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))),
+ (v8bf16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
+ (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
+ (v8bf16 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -7033,6 +7372,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
@@ -7051,6 +7391,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
}
def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
@@ -7061,6 +7403,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
@@ -7092,6 +7436,8 @@ multiclass InsertSubvectorUndef<ValueType Ty> {
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
}
@@ -7317,3 +7663,5 @@ let AddedComplexity = 10 in {
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
+
+include "AArch64InstrGISel.td"
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3156bb4469638..d975b8bd04fe6 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -66,6 +67,10 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
+// Enable register renaming to find additional store pairing opportunities.
+static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
+ cl::init(true), cl::Hidden);
+
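
Because EnableRenaming is a hidden boolean cl::opt, the new renaming logic can be toggled without rebuilding; a sketch of the expected usage (the input file name is illustrative):

    llc -mtriple=aarch64 -aarch64-load-store-renaming=false test.ll -o -

Inside the pass, disabling the flag pre-seeds MaybeCanRename to false (see the findMatchingInsn hunk below), which short-circuits the entire renaming analysis.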
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
@@ -673,14 +678,14 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
"Expected promotable zero stores.");
- MachineBasicBlock::iterator NextI = I;
- ++NextI;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == MergeMI)
- ++NextI;
+ NextI = next_nodbg(NextI, E);
unsigned Opc = I->getOpcode();
bool IsScaled = !TII->isUnscaledLdSt(Opc);
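
This patch systematically replaces bare ++/-- iterator steps with next_nodbg/prev_nodbg so that interleaved DBG_VALUE instructions can no longer influence merging decisions (debug info must not affect generated code). A minimal sketch of what the forward helper does, assuming only MachineInstr::isDebugInstr():

    // Advance once, then keep advancing past debug instructions, stopping
    // at End; the returned iterator is End or a non-debug instruction.
    template <typename IterT>
    IterT next_nodbg_sketch(IterT It, IterT End) {
      ++It;
      while (It != End && It->isDebugInstr())
        ++It;
      return It;
    }

prev_nodbg is the mirror image, stepping backwards toward the block's begin iterator.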
@@ -743,18 +748,17 @@ static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
const TargetRegisterInfo *TRI, unsigned Limit,
std::function<bool(MachineInstr &, bool)> &Fn) {
auto MBB = MI.getParent();
- for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(),
- E = MBB->rend();
- I != E; I++) {
+ for (MachineInstr &I :
+ instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
if (!Limit)
return false;
--Limit;
- bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) {
+ bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
TRI->regsOverlap(MOP.getReg(), DefReg);
});
- if (!Fn(*I, isDef))
+ if (!Fn(I, isDef))
return false;
if (isDef)
break;
@@ -778,14 +782,14 @@ MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags) {
- MachineBasicBlock::iterator NextI = I;
- ++NextI;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == Paired)
- ++NextI;
+ NextI = next_nodbg(NextI, E);
int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
@@ -1004,8 +1008,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator StoreI) {
- MachineBasicBlock::iterator NextI = LoadI;
- ++NextI;
+ MachineBasicBlock::iterator NextI =
+ next_nodbg(LoadI, LoadI->getParent()->end());
int LoadSize = TII->getMemScale(*LoadI);
int StoreSize = TII->getMemScale(*StoreI);
@@ -1140,24 +1144,11 @@ static int alignTo(int Num, int PowOf2) {
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
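
For reference, the alignTo helper visible in the hunk context rounds Num up to the next multiple of PowOf2 with the usual add-then-mask trick; two worked values:

    alignTo(13, 8) == 16  // 13 + 7 = 20 = 0b10100; clearing the low 3 bits gives 16
    alignTo(16, 8) == 16  // already-aligned input: 16 + 7 = 23; 23 & ~7 == 16

The trick requires PowOf2 to be a genuine power of two, so that ~(PowOf2 - 1) is a contiguous high-bit mask.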
-static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
- AliasAnalysis *AA) {
- // One of the instructions must modify memory.
- if (!MIa.mayStore() && !MIb.mayStore())
- return false;
-
- // Both instructions must be memory operations.
- if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
- return false;
-
- return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
-}
-
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
- if (mayAlias(MIa, *MIb, AA))
+ if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
return true;
return false;
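
The deleted two-instruction wrapper was safe to fold away because MachineInstr::mayAlias already performs the same cheap rejections internally; a paraphrased sketch of its opening logic (from the LLVM sources of this era, not from this patch):

    // If neither instruction stores to memory, they cannot alias in any
    // meaningful way, even if they read from the same address.
    if (!mayStore() && !Other.mayStore())
      return false;
    // Both instructions must be memory operations to be able to alias.
    if (!mayLoadOrStore() || !Other.mayLoadOrStore())
      return false;
    // ... memory-operand comparison, optionally consulting AA, follows ...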
@@ -1183,7 +1174,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
unsigned Count = 0;
do {
- --MBBI;
+ MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
@@ -1215,7 +1206,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
+ if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
return false;
} while (MBBI != B && Count < Limit);
return false;
@@ -1296,7 +1287,23 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
return false;
}
- auto canRenameMOP = [](const MachineOperand &MOP) {
+ auto canRenameMOP = [TRI](const MachineOperand &MOP) {
+ if (MOP.isReg()) {
+ auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
+ // Renaming registers with multiple disjunct sub-registers (e.g. the
+ // result of a LD3) means that all sub-registers are renamed, potentially
+ // impacting other instructions we did not check. Bail out.
+ // Note that this relies on the structure of the AArch64 register file. In
+ // particular, a subregister cannot be written without overwriting the
+ // whole register.
+ if (RegClass->HasDisjunctSubRegs) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Cannot rename operands with multiple disjunct subregisters ("
+ << MOP << ")\n");
+ return false;
+ }
+ }
return MOP.isImplicit() ||
(MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
};
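
A concrete illustration of the disjunct sub-register case being rejected here (the tuple register name is real; the instruction sequence is hand-written for illustration):

    // LD3 defines a consecutive D-register triple such as D0_D1_D2.
    // Renaming the tuple would implicitly rename D0, D1 and D2, yet other
    // instructions in the scanned window may use those sub-registers
    // individually without ever mentioning the tuple:
    //   ld3 { v0.8b, v1.8b, v2.8b }, [x0]   // defines the D0_D1_D2 tuple
    //   fadd d4, d1, d2                     // uses D1 and D2 directly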
@@ -1325,6 +1332,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
// For defs, check if we can rename the first def of RegToRename.
if (FoundDef) {
+ // For some pseudo instructions, we might not generate code in the end
+ // (e.g. KILL) and we would end up without a correct def for the rename
+ // register.
+ // TODO: This might be overly conservative and we could handle those cases
+ // in multiple ways:
+ // 1. Insert an extra copy, to materialize the def.
// 2. Skip pseudo-defs until we find a non-pseudo def.
+ if (MI.isPseudo()) {
+ LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
+ << "\n");
+ return false;
+ }
+
for (auto &MOP : MI.operands()) {
if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
!TRI->regsOverlap(MOP.getReg(), RegToRename))
@@ -1422,7 +1442,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator MBBI = I;
MachineBasicBlock::iterator MBBIWithRenameReg;
MachineInstr &FirstMI = *I;
- ++MBBI;
+ MBBI = next_nodbg(MBBI, E);
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
@@ -1433,6 +1453,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
Optional<bool> MaybeCanRename = None;
+ if (!EnableRenaming)
+ MaybeCanRename = {false};
+
SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
LiveRegUnits UsedInBetween;
UsedInBetween.init(*TRI);
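
LiveRegUnits tracks register units rather than whole registers, which is what makes the overlap checks here precise for tuples and sub-registers. Its usage pattern, for reference (all names are existing LLVM API; the loop is schematic):

    LiveRegUnits Units;
    Units.init(*TRI);                      // size the set for this target
    for (MachineInstr &MI : SomeRange)
      Units.accumulate(MI);                // record every def and use in MI
    bool Free = Units.available(SomeReg);  // no unit of SomeReg was touched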
@@ -1447,7 +1470,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
- for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ for (unsigned Count = 0; MBBI != E && Count < Limit;
+ MBBI = next_nodbg(MBBI, E)) {
MachineInstr &MI = *MBBI;
UsedInBetween.accumulate(MI);
@@ -1616,12 +1640,13 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
- MachineBasicBlock::iterator NextI = I;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// Return the instruction following the merged instruction, which is
// the instruction following our unmerged load. Unless that's the add/sub
// instruction we're merging, in which case it's the one after that.
- if (++NextI == Update)
- ++NextI;
+ if (NextI == Update)
+ NextI = next_nodbg(NextI, E);
int Value = Update->getOperand(2).getImm();
assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
@@ -1759,8 +1784,24 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
UsedRegUnits.clear();
- ++MBBI;
- for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MBBI = next_nodbg(MBBI, E);
+
+ // We can't post-increment the stack pointer if any instruction between
+ // the memory access (I) and the increment (MBBI) can access the memory
+ // region defined by [SP, MBBI].
+ const bool BaseRegSP = BaseReg == AArch64::SP;
+ if (BaseRegSP) {
+ // FIXME: For now, we always block the optimization over SP on Windows
+ // targets, as it requires adjusting the unwind/debug info; messing up
+ // the unwind info can actually cause a miscompile.
+ const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+ if (MAI->usesWindowsCFI() &&
+ I->getMF()->getFunction().needsUnwindTableEntry())
+ return E;
+ }
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit;
+ MBBI = next_nodbg(MBBI, E)) {
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
@@ -1777,8 +1818,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
+ // If we are optimizing SP, do not allow instructions that may load or store
+ // in between the load and the optimized value update.
if (!ModifiedRegUnits.available(BaseReg) ||
- !UsedRegUnits.available(BaseReg))
+ !UsedRegUnits.available(BaseReg) ||
+ (BaseRegSP && MBBI->mayLoadOrStore()))
return E;
}
return E;
@@ -1815,7 +1859,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
UsedRegUnits.clear();
unsigned Count = 0;
do {
- --MBBI;
+ MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
new file mode 100644
index 0000000000000..a37e380725544
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -0,0 +1,32 @@
+//=- AArch64MachineFunctionInfo.cpp - AArch64 Machine Function Info ---------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements AArch64-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+yaml::AArch64FunctionInfo::AArch64FunctionInfo(
+ const llvm::AArch64FunctionInfo &MFI)
+ : HasRedZone(MFI.hasRedZone()) {}
+
+void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<AArch64FunctionInfo>::mapping(YamlIO, *this);
+}
+
+void AArch64FunctionInfo::initializeBaseYamlFields(
+ const yaml::AArch64FunctionInfo &YamlMFI) {
+ if (YamlMFI.HasRedZone.hasValue())
+ HasRedZone = YamlMFI.HasRedZone;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 6ddb3fdb00463..84aa53f2bece1 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/Function.h"
@@ -26,6 +27,10 @@
namespace llvm {
+namespace yaml {
+struct AArch64FunctionInfo;
+} // end namespace yaml
+
class MachineInstr;
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
@@ -126,6 +131,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// stack slot.
unsigned TaggedBasePointerOffset = 0;
+ /// OutliningStyle denotes, if a function was outlined, how it was outlined,
+ /// e.g. Tail Call, Thunk, or Function if none apply.
+ Optional<std::string> OutliningStyle;
+
public:
AArch64FunctionInfo() = default;
@@ -137,6 +146,7 @@ public:
if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
HasRedZone = false;
}
+ void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
@@ -173,6 +183,9 @@ public:
void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
uint64_t getLocalStackSize() const { return LocalStackSize; }
+ void setOutliningStyle(std::string Style) { OutliningStyle = Style; }
+ Optional<std::string> getOutliningStyle() const { return OutliningStyle; }
+
void setCalleeSavedStackSize(unsigned Size) {
CalleeSavedStackSize = Size;
HasCalleeSavedStackSize = true;
@@ -333,6 +346,25 @@ private:
DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
};
+namespace yaml {
+struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
+ Optional<bool> HasRedZone;
+
+ AArch64FunctionInfo() = default;
+ AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~AArch64FunctionInfo() = default;
+};
+
+template <> struct MappingTraits<AArch64FunctionInfo> {
+ static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) {
+ YamlIO.mapOptional("hasRedZone", MFI.HasRedZone);
+ }
+};
+
+} // end namespace yaml
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
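
Together with the new AArch64MachineFunctionInfo.cpp above, this lets hasRedZone survive a round trip through textual MIR. A sketch of the serialized form this enables (the field name comes from the MappingTraits above; the enclosing machineFunctionInfo block is standard MIR):

    machineFunctionInfo:
      hasRedZone:      true

When the field is absent, the Optional stays unset and initializeBaseYamlFields keeps the in-memory default.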
diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 9135f1b401223..9044c94bc4fe5 100644
--- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -250,6 +250,20 @@ static bool isConstantUsingVectorTy(const Type *CstTy) {
return false;
}
+// Returns true if \p C contains only ConstantData leaves and no global values,
+// block addresses or constant expressions. Traverses ConstantAggregates.
+static bool containsOnlyConstantData(const Constant *C) {
+ if (isa<ConstantData>(C))
+ return true;
+
+ if (isa<GlobalValue>(C) || isa<BlockAddress>(C) || isa<ConstantExpr>(C))
+ return false;
+
+ return all_of(C->operands(), [](const Use &U) {
+ return containsOnlyConstantData(cast<Constant>(&U));
+ });
+}
+
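
As an illustration of what the new predicate accepts and rejects (LLVM IR constants shown; @g is a hypothetical global):

    ; accepted: pure ConstantData leaves inside a ConstantAggregate
    <4 x i32> <i32 1, i32 2, i32 3, i32 4>

    ; rejected: a ConstantExpr referencing a GlobalValue appears as a leaf
    <2 x i64> <i64 ptrtoint (i32* @g to i64), i64 0>

Constants in the second category may need relocations or code expansion, which is exactly what the tightened check in runOnFunction below filters out.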
/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
/// a load of a global variable initialized with Cst.
/// A use should be converted if it is legal to do so.
@@ -304,7 +318,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
// Do not mess with inline asm.
const CallInst *CI = dyn_cast<const CallInst>(Instr);
- return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
+ return !(CI && CI->isInlineAsm());
}
/// Check if the given Cst should be converted into
@@ -550,9 +564,10 @@ bool AArch64PromoteConstant::runOnFunction(Function &F,
for (Use &U : I.operands()) {
Constant *Cst = dyn_cast<Constant>(U);
// There is no point in promoting global values as they are already
- // global. Do not promote constant expressions either, as they may
- // require some code expansion.
- if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+ // global. Do not promote constants containing constant expressions, global
+ // values or block addresses either, as they may require some code
+ // expansion.
+ if (!Cst || isa<GlobalValue>(Cst) || !containsOnlyConstantData(Cst))
continue;
// Check if this constant is worth promoting.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 14f839cd4f812..886158ca44901 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -43,24 +43,27 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
- if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
- return CSR_Win_AArch64_CFGuard_Check_SaveList;
- if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
- return CSR_Win_AArch64_AAPCS_SaveList;
+
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
+
+ // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save
+ // lists depending on that will need to have their Darwin variant as well.
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return getDarwinCalleeSavedRegs(MF);
+
+ if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
+ return CSR_Win_AArch64_CFGuard_Check_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
+ return CSR_Win_AArch64_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
return CSR_AArch64_SVE_AAPCS_SaveList;
- if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
- return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
- CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
- CSR_AArch64_CXX_TLS_Darwin_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
@@ -68,17 +71,47 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
- if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
- return CSR_Darwin_AArch64_AAPCS_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::Win64)
+ // This is for OSes other than Windows; Windows is a separate case further
+ // above.
+ return CSR_AArch64_AAPCS_X18_SaveList;
return CSR_AArch64_AAPCS_SaveList;
}
+const MCPhysReg *
+AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
+ "Invalid subtarget for getDarwinCalleeSavedRegs");
+
+ if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
+ report_fatal_error(
+ "Calling convention CFGuard_Check is unsupported on Darwin.");
+ if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
+ return CSR_Darwin_AArch64_AAVPCS_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
+ report_fatal_error(
+ "Calling convention SVE_VectorCall is unsupported on Darwin.");
+ if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
+ return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()
+ ? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList
+ : CSR_Darwin_AArch64_CXX_TLS_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF->getFunction().getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
+ return CSR_Darwin_AArch64_AAPCS_SaveList;
+}
+
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
- return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+ return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList;
return nullptr;
}
@@ -113,6 +146,32 @@ AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
}
const uint32_t *
+AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
+ "Invalid subtarget for getDarwinCallPreservedMask");
+
+ if (CC == CallingConv::CXX_FAST_TLS)
+ return CSR_Darwin_AArch64_CXX_TLS_RegMask;
+ if (CC == CallingConv::AArch64_VectorCall)
+ return CSR_Darwin_AArch64_AAVPCS_RegMask;
+ if (CC == CallingConv::AArch64_SVE_VectorCall)
+ report_fatal_error(
+ "Calling convention SVE_VectorCall is unsupported on Darwin.");
+ if (CC == CallingConv::CFGuard_Check)
+ report_fatal_error(
+ "Calling convention CFGuard_Check is unsupported on Darwin.");
+ if (MF.getSubtarget<AArch64Subtarget>()
+ .getTargetLowering()
+ ->supportSwiftError() &&
+ MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::PreserveMost)
+ return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
+ return CSR_Darwin_AArch64_AAPCS_RegMask;
+}
+
+const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
@@ -121,9 +180,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
- if (CC == CallingConv::CXX_FAST_TLS)
- return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
- : CSR_AArch64_CXX_TLS_Darwin_RegMask;
+
+ // All the following calling conventions are handled differently on Darwin.
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+ if (SCS)
+ report_fatal_error("ShadowCallStack attribute not supported on Darwin.");
+ return getDarwinCallPreservedMask(MF, CC);
+ }
+
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
@@ -145,7 +209,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
- return CSR_AArch64_TLS_Darwin_RegMask;
+ return CSR_Darwin_AArch64_TLS_RegMask;
assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
@@ -186,6 +250,8 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
// In case that the calling convention does not use the same register for
// both, the function should return NULL (does not currently apply)
assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask;
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
@@ -222,7 +288,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
- unsigned Reg) const {
+ MCRegister Reg) const {
return getReservedRegs(MF)[Reg];
}
@@ -240,11 +306,11 @@ void AArch64RegisterInfo::emitReservedArgRegCallError(
}
bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const {
+ MCRegister PhysReg) const {
return !isReservedReg(MF, PhysReg);
}
-bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
}
@@ -390,12 +456,16 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
+ // If even offset 0 is illegal, we don't want a virtual base register.
+ if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
+ return false;
+
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
- unsigned BaseReg,
+ Register BaseReg,
int64_t Offset) const {
assert(MI && "Unable to get the legal offset for nil instruction.");
StackOffset SaveOffset(Offset, MVT::i8);
@@ -405,7 +475,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
/// at the beginning of the basic block.
void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg,
+ Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -426,7 +496,7 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addImm(Shifter);
}
-void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
// ARM doesn't need the general 64-bit offsets
StackOffset Off(Offset, MVT::i8);
@@ -445,6 +515,27 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
+// Create a scratch register for the frame index elimination in an instruction.
+// This function has special handling of stack tagging loop pseudos, in which
+// case it can also change the instruction opcode (but not the operands).
+static Register
+createScratchRegisterForInstruction(MachineInstr &MI,
+ const AArch64InstrInfo *TII) {
+ // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
+ // replace the instruction with the writeback variant because it will now
+ // satisfy the operand constraints for it.
+ if (MI.getOpcode() == AArch64::STGloop) {
+ MI.setDesc(TII->get(AArch64::STGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else if (MI.getOpcode() == AArch64::STZGloop) {
+ MI.setDesc(TII->get(AArch64::STZGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else {
+ return MI.getMF()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ }
+}
+
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -461,7 +552,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
bool Tagged =
MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
- unsigned FrameReg;
+ Register FrameReg;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
@@ -531,8 +622,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- Register ScratchReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
@@ -572,6 +662,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 32;
case AArch64::FPR128_loRegClassID:
+ case AArch64::FPR64_loRegClassID:
+ case AArch64::FPR16_loRegClassID:
return 16;
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 2c3f82c530d8a..22a8ba76c6111 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -34,7 +34,7 @@ public:
return getEncodingValue(i);
}
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const;
bool isAnyArgRegReserved(const MachineFunction &MF) const;
void emitReservedArgRegCallError(const MachineFunction &MF) const;
@@ -44,10 +44,13 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const;
const MCPhysReg *
getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
+ const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const;
unsigned getCSRFirstUseCost() const override {
// The cost will be compared against BlockFrequency where entry has the
@@ -83,8 +86,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const override;
- bool isConstantPhysReg(unsigned PhysReg) const override;
+ MCRegister PhysReg) const override;
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
unsigned Kind = 0) const override;
@@ -96,12 +99,12 @@ public:
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum,
@@ -118,10 +121,6 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction&) const override {
- return true;
- }
-
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
};
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index f52feab039530..bd05c56009a1d 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -422,25 +422,35 @@ def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
}
-def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
+ let Size = 16;
+}
+
+def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> {
let Size = 16;
}
def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
- v1i64, v4f16],
- 64, (sequence "D%u", 0, 31)>;
+ v1i64, v4f16, v4bf16],
+ 64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+ [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32,
+ v1f64],
+ 64, (trunc FPR64, 16)>;
+
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
def FPR128 : RegisterClass<"AArch64",
[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
- v8f16],
+ v8f16, v8bf16],
128, (sequence "Q%u", 0, 31)>;
// The lower 16 vector registers. Some instructions can only take registers
// in this range.
def FPR128_lo : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16,
+ v8bf16],
128, (trunc FPR128, 16)>;
// Pairs, triples, and quads of 64-bit vector registers.
@@ -503,6 +513,9 @@ def VectorRegLoAsmOperand : AsmOperandClass {
let Name = "VectorRegLo";
let PredicateMethod = "isNeonVectorRegLo";
}
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
@@ -641,6 +654,10 @@ def FPR16Op : RegisterOperand<FPR16, "printOperand"> {
let ParserMatchClass = FPRAsmOperand<"FPR16">;
}
+def FPR16Op_lo : RegisterOperand<FPR16_lo, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR16_lo">;
+}
+
def FPR32Op : RegisterOperand<FPR32, "printOperand"> {
let ParserMatchClass = FPRAsmOperand<"FPR32">;
}
@@ -664,11 +681,11 @@ def XSeqPairs : RegisterTuples<[sube64, subo64],
[(decimate (rotl GPR64, 0), 2),
(decimate (rotl GPR64, 1), 2)]>;
-def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
+def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
(add WSeqPairs)>{
let Size = 64;
}
-def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
+def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
(add XSeqPairs)>{
let Size = 128;
}
@@ -780,7 +797,7 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
}
-// Enum descibing the element size for destructive
+// Enum describing the element size for destructive
// operations.
class ElementSizeEnum<bits<3> val> {
bits<3> Value = val;
@@ -862,6 +879,7 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;
class ZPRClass<int lastreg> : RegisterClass<"AArch64",
[nxv16i8, nxv8i16, nxv4i32, nxv2i64,
nxv2f16, nxv4f16, nxv8f16,
+ nxv2bf16, nxv4bf16, nxv8bf16,
nxv2f32, nxv4f32,
nxv2f64],
128, (sequence "Z%u", 0, lastreg)> {
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 28a7e680849b0..fc31e701d3af1 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -219,7 +219,7 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
// Check if replacement decision is already available in the cached table.
   // If so, return it.
- std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+ std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
return SIMDInstrTable[InstID];
@@ -288,7 +288,8 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
// For this optimization, check for all concerned instructions.
case Interleave:
- std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+ std::string Subtarget =
+ std::string(SchedModel.getSubtargetInfo()->getCPU());
if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
return InterlEarlyExit[Subtarget];
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
new file mode 100644
index 0000000000000..cb4dc8462f68d
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -0,0 +1,443 @@
+//===- AArch64SLSHardening.cpp - Harden Straight Line Misspeculation ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that inserts code to mitigate side channel
+// vulnerabilities that may arise under straight line miss-speculation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-sls-hardening"
+
+#define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass"
+
+namespace {
+
+class AArch64SLSHardening : public MachineFunctionPass {
+public:
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const AArch64Subtarget *ST;
+
+ static char ID;
+
+ AArch64SLSHardening() : MachineFunctionPass(ID) {
+ initializeAArch64SLSHardeningPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return AARCH64_SLS_HARDENING_NAME; }
+
+private:
+ bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const;
+ bool hardenBLRs(MachineBasicBlock &MBB) const;
+ MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator) const;
+};
+
+} // end anonymous namespace
+
+char AArch64SLSHardening::ID = 0;
+
+INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening",
+ AARCH64_SLS_HARDENING_NAME, false, false)
+
+static void insertSpeculationBarrier(const AArch64Subtarget *ST,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL,
+ bool AlwaysUseISBDSB = false) {
+  assert(MBBI != MBB.begin() &&
+         "Must not insert SpeculationBarrierEndBB as the only instruction in MBB.");
+ assert(std::prev(MBBI)->isBarrier() &&
+ "SpeculationBarrierEndBB must only follow unconditional control flow "
+ "instructions.");
+ assert(std::prev(MBBI)->isTerminator() &&
+ "SpeculationBarrierEndBB must only follow terminators.");
+ const TargetInstrInfo *TII = ST->getInstrInfo();
+ unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB
+ ? AArch64::SpeculationBarrierSBEndBB
+ : AArch64::SpeculationBarrierISBDSBEndBB;
+ if (MBBI == MBB.end() ||
+ (MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB &&
+ MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB))
+ BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
+}
+
+bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<AArch64Subtarget>();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ Modified |= hardenReturnsAndBRs(MBB);
+ Modified |= hardenBLRs(MBB);
+ }
+
+ return Modified;
+}
+
+static bool isBLR(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AArch64::BLR:
+ case AArch64::BLRNoIP:
+ return true;
+ case AArch64::BLRAA:
+ case AArch64::BLRAB:
+ case AArch64::BLRAAZ:
+ case AArch64::BLRABZ:
+ llvm_unreachable("Currently, LLVM's code generator does not support "
+ "producing BLRA* instructions. Therefore, there's no "
+ "support in this pass for those instructions.");
+ }
+ return false;
+}
+
+bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsRetBr())
+ return false;
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) {
+ assert(MI.isTerminator());
+ insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
+ Modified = true;
+ }
+ }
+ return Modified;
+}
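+
+// For illustration only: with this hardening, a return such as
+//   ret
+// becomes
+//   ret
+//   dsb sy
+//   isb
+// or, on subtargets with the SB extension,
+//   ret
+//   sb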
+
+static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
+
+static const struct ThunkNameAndReg {
+ const char* Name;
+ Register Reg;
+} SLSBLRThunks[] = {
+ { "__llvm_slsblr_thunk_x0", AArch64::X0},
+ { "__llvm_slsblr_thunk_x1", AArch64::X1},
+ { "__llvm_slsblr_thunk_x2", AArch64::X2},
+ { "__llvm_slsblr_thunk_x3", AArch64::X3},
+ { "__llvm_slsblr_thunk_x4", AArch64::X4},
+ { "__llvm_slsblr_thunk_x5", AArch64::X5},
+ { "__llvm_slsblr_thunk_x6", AArch64::X6},
+ { "__llvm_slsblr_thunk_x7", AArch64::X7},
+ { "__llvm_slsblr_thunk_x8", AArch64::X8},
+ { "__llvm_slsblr_thunk_x9", AArch64::X9},
+ { "__llvm_slsblr_thunk_x10", AArch64::X10},
+ { "__llvm_slsblr_thunk_x11", AArch64::X11},
+ { "__llvm_slsblr_thunk_x12", AArch64::X12},
+ { "__llvm_slsblr_thunk_x13", AArch64::X13},
+ { "__llvm_slsblr_thunk_x14", AArch64::X14},
+ { "__llvm_slsblr_thunk_x15", AArch64::X15},
+  // X16 and X17 are deliberately missing, as the mitigation requires those
+  // registers to not be used in BLR. See the comment in ConvertBLRToBL for
+  // more details.
+ { "__llvm_slsblr_thunk_x18", AArch64::X18},
+ { "__llvm_slsblr_thunk_x19", AArch64::X19},
+ { "__llvm_slsblr_thunk_x20", AArch64::X20},
+ { "__llvm_slsblr_thunk_x21", AArch64::X21},
+ { "__llvm_slsblr_thunk_x22", AArch64::X22},
+ { "__llvm_slsblr_thunk_x23", AArch64::X23},
+ { "__llvm_slsblr_thunk_x24", AArch64::X24},
+ { "__llvm_slsblr_thunk_x25", AArch64::X25},
+ { "__llvm_slsblr_thunk_x26", AArch64::X26},
+ { "__llvm_slsblr_thunk_x27", AArch64::X27},
+ { "__llvm_slsblr_thunk_x28", AArch64::X28},
+ { "__llvm_slsblr_thunk_x29", AArch64::FP},
+  // X30 is deliberately missing, for reasons similar to why X16 and X17 are
+  // missing.
+ { "__llvm_slsblr_thunk_x31", AArch64::XZR},
+};
+
+namespace {
+struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
+ const char *getThunkPrefix() { return SLSBLRNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ // FIXME: This could also check if there are any BLRs in the function
+ // to more accurately reflect if a thunk will be needed.
+ return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+} // namespace
+
+void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ // FIXME: It probably would be possible to filter which thunks to produce
+ // based on which registers are actually used in BLR instructions in this
+ // function. But would that be a worthwhile optimization?
+ for (auto T : SLSBLRThunks)
+ createThunkFunction(MMI, T.Name);
+}
+
+void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+  // FIXME: How to better communicate the register number, rather than
+  // through the name and a lookup table?
+ assert(MF.getName().startswith(getThunkPrefix()));
+ auto ThunkIt = llvm::find_if(
+ SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); });
+ assert(ThunkIt != std::end(SLSBLRThunks));
+ Register ThunkReg = ThunkIt->Reg;
+
+ const TargetInstrInfo *TII =
+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  assert(MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+  // These thunks need to consist of the following instructions:
+  //  __llvm_slsblr_thunk_xN:
+  //      MOV x16, xN
+  //      BR  x16
+  //      barrierInsts
+ Entry->addLiveIn(ThunkReg);
+ // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0
+ BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16)
+ .addReg(AArch64::XZR)
+ .addReg(ThunkReg)
+ .addImm(0);
+ BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16);
+  // Make sure the thunks do not make use of the SB extension in case there
+  // is a function somewhere that calls them but has the SB extension disabled
+  // locally, even though it is enabled for the module otherwise. Therefore
+  // set AlwaysUseISBDSB to true.
+ insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry,
+ Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
+}
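+
+// End-to-end example (illustrative only): a "blr x1" in some function becomes
+// "bl __llvm_slsblr_thunk_x1", and the thunk itself is emitted as:
+//   __llvm_slsblr_thunk_x1:
+//     mov x16, x1
+//     br  x16
+//     dsb sy
+//     isb
+// (the thunks always use the ISB+DSB barrier, as explained above).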
+
+MachineBasicBlock &
+AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const {
+ // Transform a BLR to a BL as follows:
+ // Before:
+ // |-----------------------------|
+ // | ... |
+ // | instI |
+ // | BLR xN |
+ // | instJ |
+ // | ... |
+ // |-----------------------------|
+ //
+ // After:
+ // |-----------------------------|
+ // | ... |
+ // | instI |
+ // | BL __llvm_slsblr_thunk_xN |
+ // | instJ |
+ // | ... |
+ // |-----------------------------|
+ //
+ // __llvm_slsblr_thunk_xN:
+ // |-----------------------------|
+  //   | MOV x16, xN                 |
+  //   | BR x16                      |
+ // | barrierInsts |
+ // |-----------------------------|
+ //
+ // The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter.
+ // This function merely needs to transform BLR xN into BL
+ // __llvm_slsblr_thunk_xN.
+ //
+  // Since linkers are allowed to clobber X16 and X17 on function calls, the
+  // above mitigation only works if the original BLR instruction was neither
+  // BLR X16 nor BLR X17. Earlier code generation must therefore ensure that
+  // no BLR X16 or BLR X17 is produced while the mitigation is enabled.
+
+ MachineInstr &BLR = *MBBI;
+ assert(isBLR(BLR));
+ unsigned BLOpcode;
+ Register Reg;
+ bool RegIsKilled;
+ switch (BLR.getOpcode()) {
+ case AArch64::BLR:
+ case AArch64::BLRNoIP:
+ BLOpcode = AArch64::BL;
+ Reg = BLR.getOperand(0).getReg();
+ assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR);
+ RegIsKilled = BLR.getOperand(0).isKill();
+ break;
+ case AArch64::BLRAA:
+ case AArch64::BLRAB:
+ case AArch64::BLRAAZ:
+ case AArch64::BLRABZ:
+ llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, "
+ "therefore there is no need to support them for now.");
+ default:
+ llvm_unreachable("unhandled BLR");
+ }
+ DebugLoc DL = BLR.getDebugLoc();
+
+  // If we also wanted to support the BLRAA and BLRAB instructions, we'd need
+  // many more kinds of thunks.
+ // For example, a
+ //
+ // BLRAA xN, xM
+ //
+ // instruction probably would need to be transformed to something like:
+ //
+ // BL __llvm_slsblraa_thunk_x<N>_x<M>
+ //
+ // __llvm_slsblraa_thunk_x<N>_x<M>:
+ // BRAA x<N>, x<M>
+ // barrierInsts
+ //
+  // Given that about 30 different values are possible for each of N and M in
+  // the above, the current way of producing indirect thunks would yield about
+  // 30 times 30, i.e. about 900 thunks (most of which might never actually be
+  // called). This would double again to support both the BLRAA and BLRAB
+  // variants of those instructions.
+  // If we wanted to support this, we'd probably need a different way to
+  // produce thunk functions, based on which variants are actually needed,
+  // rather than producing all possible variants.
+  // So far, LLVM never produces BLRA* instructions, so let's leave this for
+  // the future, when it starts doing so.
+ MachineFunction &MF = *MBBI->getMF();
+ MCContext &Context = MBB.getParent()->getContext();
+ auto ThunkIt =
+ llvm::find_if(SLSBLRThunks, [Reg](auto T) { return T.Reg == Reg; });
+  assert(ThunkIt != std::end(SLSBLRThunks));
+ MCSymbol *Sym = Context.getOrCreateSymbol(ThunkIt->Name);
+
+ MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
+
+ // Now copy the implicit operands from BLR to BL and copy other necessary
+ // info.
+  // However, both BLR and BL instructions implicitly use SP and implicitly
+  // define LR. Blindly copying implicit operands would result in the SP and
+  // LR operands being present multiple times. While this may not be too much of
+ // an issue, let's avoid that for cleanliness, by removing those implicit
+ // operands from the BL created above before we copy over all implicit
+ // operands from the BLR.
+ int ImpLROpIdx = -1;
+ int ImpSPOpIdx = -1;
+ for (unsigned OpIdx = BL->getNumExplicitOperands();
+ OpIdx < BL->getNumOperands(); OpIdx++) {
+ MachineOperand Op = BL->getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (Op.getReg() == AArch64::LR && Op.isDef())
+ ImpLROpIdx = OpIdx;
+ if (Op.getReg() == AArch64::SP && !Op.isDef())
+ ImpSPOpIdx = OpIdx;
+ }
+ assert(ImpLROpIdx != -1);
+ assert(ImpSPOpIdx != -1);
+ int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+ int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+ BL->RemoveOperand(FirstOpIdxToRemove);
+ BL->RemoveOperand(SecondOpIdxToRemove);
+  // Now copy over the implicit operands from the original BLR.
+ BL->copyImplicitOps(MF, BLR);
+ MF.moveCallSiteInfo(&BLR, BL);
+  // Also add the register the BLR called through as being used by the called
+  // thunk.
+ BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
+ RegIsKilled /*isKill*/));
+ // Remove BLR instruction
+ MBB.erase(MBBI);
+
+ return MBB;
+}
+
+bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsBlr())
+ return false;
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ if (isBLR(MI)) {
+ ConvertBLRToBL(MBB, MBBI);
+ Modified = true;
+ }
+ }
+ return Modified;
+}
+
+FunctionPass *llvm::createAArch64SLSHardeningPass() {
+ return new AArch64SLSHardening();
+}
+
+namespace {
+class AArch64IndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64IndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ std::tuple<SLSBLRThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
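+
+  // Illustrative sketch: with C++17 fold expressions, the helpers above could
+  // collapse to
+  //   (std::get<ThunkInserterT>(ThunkInserters).init(M), ...);
+  //   bool Modified = (std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF) | ...);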
+};
+
+} // end anonymous namespace
+
+char AArch64IndirectThunks::ID = 0;
+
+FunctionPass *llvm::createAArch64IndirectThunks() {
+ return new AArch64IndirectThunks();
+}
+
+bool AArch64IndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c849d7af9a40b..28a54e6f7d79f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,65 +10,188 @@
//
//===----------------------------------------------------------------------===//
-def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
+// For predicated nodes where the entire operation is controlled by a governing
+// predicate, please stick to a naming convention similar to the one used for
+// the ISD nodes:
+//
+// SDNode <=> AArch64ISD
+// -------------------------------
+// _m<n> <=> _MERGE_OP<n>
+// _mt <=> _MERGE_PASSTHRU
+// _z <=> _MERGE_ZERO
+// _p <=> _PRED
+//
+// Given the context of this file, it is not strictly necessary to use _p to
+// distinguish predicated from unpredicated nodes, since most SVE
+// instructions are predicated.
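+//
+// For example, AArch64ISD::FADD_PRED appears below as AArch64fadd_p, and
+// AArch64ISD::DUP_MERGE_PASSTHRU as AArch64dup_mt.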
+
+// Contiguous loads - node definitions
+//
+def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
+// Non-faulting & first-faulting loads - node definitions
+//
+def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Contiguous load and replicate - node definitions
+//
+
+def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+
+// Gather loads - node definitions
+//
+def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [
+def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [
+def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+// Contiguous stores - node definitions
+//
+def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>,
+ SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2>
+]>;
+
+def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>;
+
+// Scatter stores - node definitions
+//
+def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [
+def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-
-def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-
-def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
+def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
+// AArch64 SVE/SVE2 - the remaining node definitions
+//
+
+// SVE CNT/INC/RDVL
+def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
+def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
+def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
+def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;
+
+// SVE DEC
+def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
+def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
+def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
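+
+// Each pattern above matches a constant of the form m * Scale for m in
+// [Min, Max] (the SelectRDVLImm<Min, Max, Scale> template arguments), i.e.
+// roughly the values a single RDVL/CNT/INC/DEC-style instruction can
+// materialize.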
def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
+def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>;
+def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
+def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
+def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
+def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
+def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
+def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
+def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
+def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
+def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
+def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
+
+def SDT_AArch64Arith : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
+]>;
+
+def SDT_AArch64FMA : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
+]>;
-def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
-def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
-def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
-def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
-def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
-def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
-def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
-def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
-def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
+// Predicated operations where the result in inactive lanes is unspecified.
+def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
+def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
+def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
+def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
+def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
+
+// Merging op1 into the inactive lanes.
+def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;
def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
+def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
@@ -76,42 +199,57 @@ def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
-let Predicates = [HasSVE] in {
+def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
+def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
+
+def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
+def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
- def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
- def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
- def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
- def SETFFR : sve_int_setffr<"setffr">;
- def WRFFR : sve_int_wrffr<"wrffr">;
+def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
- defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
- defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
- defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
- defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>;
- defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>;
- defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>;
+let Predicates = [HasSVE] in {
+ defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
+ def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
+ defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
+ def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
+ def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
+
+ defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
+ defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
- defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>;
- defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>;
- defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>;
+ defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
+ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
+
+ defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
+ defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
+ defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
+ }
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
- defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
- defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
- defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
- defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
- defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
- defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
@@ -121,32 +259,45 @@ let Predicates = [HasSVE] in {
// SVE predicated integer reductions.
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
- defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>;
- defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>;
- defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>;
- defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>;
- defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>;
- defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>;
- defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>;
+ defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
+ defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
+ defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
+ defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
- defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>;
- defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>;
- defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>;
- defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>;
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
+ defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
+ defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;
- defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
- defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
+ defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
- defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>;
- defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>;
- defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>;
- defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>;
+  // Add an unpredicated alternative for the mul instruction.
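+  // (PTRUE with pattern 31, i.e. ALL, materializes an all-active predicate.)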
+ def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
+ (MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
+ (MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
+ (MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
+ (MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;
+
+ defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
+ defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
+ defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
+ defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;
+
+ defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
+ defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
@@ -166,15 +317,20 @@ let Predicates = [HasSVE] in {
defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
+ }
+
defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
- defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>;
- defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>;
- defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>;
- defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>;
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
@@ -190,19 +346,36 @@ let Predicates = [HasSVE] in {
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
- defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>;
- defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>;
- defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>;
- defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>;
- defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>;
- defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>;
- defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>;
- defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>;
- defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>;
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
+ defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
+ defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
+ defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
+ defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
+ defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
- defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>;
- defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>;
- defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>;
+ defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
+ defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
+ defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
+
+ defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
+ defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
+ defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
+ defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
+ defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
+ defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
+ defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
+ defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
+ defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
+ defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
+ }
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
@@ -226,6 +399,16 @@ let Predicates = [HasSVE] in {
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
+  // Add patterns for FMA where inactive lanes are undef.
+ // FIXME: Implement a pseudo so we can choose a better instruction after
+ // regalloc.
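+  // AArch64fma_p computes (Op1 * Op2) + Op3, so Op3 is mapped onto FMLA's
+  // destructive accumulator operand, which comes first after the predicate.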
+ def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
+ (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
+ (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
+ (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
+
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
@@ -235,12 +418,21 @@ let Predicates = [HasSVE] in {
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
// SVE floating point reductions.
- defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>;
- defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>;
- defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>;
- defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>;
- defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>;
- defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>;
+ defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
+ defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
+ defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
+ defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
+ defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
+ defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
+
+ // Use more efficient NEON instructions to extract elements within the NEON
+  // part (first 128 bits) of an SVE register.
+ def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
+ def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
+ def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
+ (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
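+
+  // Extracting lane 0 is purely a subregister extraction (zsub to get the
+  // 128-bit NEON view, then the element-sized subregister), so no extra
+  // instruction needs to be emitted.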
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
@@ -257,18 +449,88 @@ let Predicates = [HasSVE] in {
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
- defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
- defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
+ defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
+ defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+ (CPY_ZPmV_H $passthru, $pg, $splat)>;
+ }
+
+ // Duplicate FP scalar into all vector elements
+ def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
+ (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
+ def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
+ (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
+ (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ }
+
+ // Duplicate +0.0 into all vector elements
+ def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ }
+
+ // Duplicate Int immediate into all vector elements
+ def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_B $a, $b)>;
+ def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_H $a, $b)>;
+ def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_S $a, $b)>;
+ def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_D $a, $b)>;
+
+ // Duplicate FP immediate into all vector elements
+ let AddedComplexity = 2 in {
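+    // AddedComplexity makes these immediate patterns win over the more
+    // general splat-from-register patterns above when the operand is an
+    // FP immediate.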
+ def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
+ (FDUP_ZI_S fpimm32:$imm8)>;
+ def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
+ (FDUP_ZI_S fpimm32:$imm8)>;
+ def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
+ (FDUP_ZI_D fpimm64:$imm8)>;
+ }
// Select elements from either vector (predicated)
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
+ }
+
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
+ }
+
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
@@ -277,6 +539,10 @@ let Predicates = [HasSVE] in {
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
+ }
+
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
@@ -290,34 +556,34 @@ let Predicates = [HasSVE] in {
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
- def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">;
- def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">;
- def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">;
- def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">;
+ defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
+ defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
+ defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
+ defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>;
- def BRKN_PPzP : sve_int_brkn<0b0, "brkn">;
- def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">;
+ defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>;
+ defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>;
- defm BRKA_PPzP : sve_int_break_z<0b000, "brka">;
- defm BRKA_PPmP : sve_int_break_m<0b001, "brka">;
- defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">;
- defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">;
- defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">;
- defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">;
+ defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>;
+ defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>;
+ defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>;
+ defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>;
+ defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
+ defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
- defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>;
+ defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
- defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>;
+ defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
- defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>;
+ defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
@@ -333,11 +599,23 @@ let Predicates = [HasSVE] in {
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
+ def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
+ }
+
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
+ def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
+ }
+
  // Contiguous loads (register + immediate)
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
@@ -468,115 +746,115 @@ let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
- defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
- defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
- defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>;
- defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>;
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>;
// Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
- defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
- defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
  // Gathers using unscaled 32-bit offsets unpacked into 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
- defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
  // Gathers using scaled 32-bit offsets unpacked into 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
@@ -640,16 +918,16 @@ let Predicates = [HasSVE] in {
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
- defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>;
- defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>;
- defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>;
+ defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>;
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
- defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>;
- defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>;
- defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>;
- defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>;
+ defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
@@ -722,47 +1000,92 @@ let Predicates = [HasSVE] in {
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
+multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
+ (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>;
+ }
+
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)),
+ (RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>;
+ }
+
+ // default fallback
+ def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)),
+ (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
+ }
+
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;
+
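+  // The multiclass above tries the three address forms in decreasing
+  // order of AddedComplexity: reg+imm (folding a matching scaled offset),
+  // reg+reg (folding an index via AddrCP), then a zero-offset fallback.
+  // A sketch of the expected selection, assuming prfop 0 encodes
+  // pldl1keep:
+  //
+  //   llvm.aarch64.sve.prf(<nxv16i1> %pg, ptr %base, i32 0)
+  //     -> prfb pldl1keep, p0, [x0]        ; _default, zero offset
+  //   llvm.aarch64.sve.prf(<nxv16i1> %pg, ptr (%base + %idx), i32 0)
+  //     -> prfb pldl1keep, p0, [x0, x1]    ; _reg_reg via am_sve_regreg_lsl0
+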
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
- defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+ defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
+ defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
+ defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
+ defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
- defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
+ defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
+ defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
+ defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
- defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
- defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
- defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
- defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+ defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>;
+ defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>;
+ defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>;
+ defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
- defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
+ defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
+ defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
+ defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
+ defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
- defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+ defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
+ defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
+ defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
+ defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_0 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_1 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_2 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_3 $Op1, $Op2)>;
+
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
+
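+  // The patterns above map the sve.adr[bhwd] intrinsics onto the LSL
+  // forms of ADR; the suffix selects the index shift (b=0, h=1, w=2, d=3).
+  // For example (a sketch):
+  //
+  //   llvm.aarch64.sve.adrd(<nxv2i64> %base, <nxv2i64> %idx)
+  //     -> adr z0.d, [z1.d, z2.d, lsl #3]
+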
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
+ }
+
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
@@ -770,6 +1093,15 @@ let Predicates = [HasSVE] in {
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
+ }
+
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
@@ -777,12 +1109,12 @@ let Predicates = [HasSVE] in {
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
- defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>;
- defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>;
- defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>;
- defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>;
- defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>;
- defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>;
+ defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
+ defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
+ defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
+ defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>;
+ defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>;
+ defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>;
defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
@@ -795,22 +1127,22 @@ let Predicates = [HasSVE] in {
defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;
- defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>;
- defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>;
- defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>;
- defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>;
- defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>;
- defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>;
- defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>;
- defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>;
- defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>;
- defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>;
-
- defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>;
- defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>;
- defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>;
- defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo", int_aarch64_sve_fcmpuo>;
+ defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>;
+ defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>;
+ defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>;
+ defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>;
+ defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>;
+ defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>;
+ defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>;
+ defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>;
+ defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
+ defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
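+
+  // Each compare takes a pair of condition codes: the natural CC and the
+  // CC that selects the same instruction once ISel commutes the operands.
+  // A sketch using the register form (setcc operand order assumed):
+  //
+  //   setcc(x, y, setge) -> cmpge p0.s, p0/z, z_x.s, z_y.s
+  //   setcc(x, y, setle) -> cmpge p0.s, p0/z, z_y.s, z_x.s  ; swapped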
+
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
@@ -928,71 +1260,78 @@ let Predicates = [HasSVE] in {
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
- defm INDEX_RR : sve_int_index_rr<"index">;
- defm INDEX_IR : sve_int_index_ir<"index">;
- defm INDEX_RI : sve_int_index_ri<"index">;
- defm INDEX_II : sve_int_index_ii<"index">;
+ defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
+ defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
+ defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
+ defm INDEX_II : sve_int_index_ii<"index", index_vector>;
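+
+  // index_vector materialises an arithmetic sequence. Sketches of the
+  // underlying instruction forms:
+  //
+  //   index z0.s, #0, #1    ; z0.s = { 0, 1, 2, 3, ... }     (INDEX_II)
+  //   index z0.d, x0, #-2   ; z0.d = { x0, x0-2, x0-4, ... } (INDEX_RI)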
// Unpredicated shifts
- defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
- defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
- defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+ defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
+ defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
+ defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
- defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
- defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
- defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
+ defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
+ defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
+ defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
+ }
- defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>;
- defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>;
- defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>;
- defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>;
- defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>;
- defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>;
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
+ defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
+ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
+ defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
- defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>;
- defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>;
- defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
- defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
@@ -1004,6 +1343,18 @@ let Predicates = [HasSVE] in {
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
+ let Predicates = [HasBF16, HasSVE] in {
+ defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+ defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+ defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
+ defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+ defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+ defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+ defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+ defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
+ defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
+ }
+
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
@@ -1089,6 +1440,20 @@ let Predicates = [HasSVE] in {
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+ // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
+ // These get expanded to individual LDR_ZXI/STR_ZXI instructions in
+ // AArch64ExpandPseudoInsts.
+ let mayLoad = 1, hasSideEffects = 0 in {
+ def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ }
+ let mayStore = 1, hasSideEffects = 0 in {
+ def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ }
+
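+  // A sketch of the expansion, assuming a two-register tuple and an
+  // offset of -2 (offsets are in multiples of VL, one per sub-register):
+  //
+  //   STR_ZZXI z0_z1, [sp, #-2]
+  //     -> str z0, [sp, #-2, mul vl]
+  //     -> str z1, [sp, #-1, mul vl]
+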
def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
@@ -1098,6 +1463,25 @@ let Predicates = [HasSVE] in {
def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
+  // LD1RQ of 128-bit masked data (load and replicate a 128-bit segment)
+ def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_B_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_H_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_W_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_D_IMM $gp, $base, (i64 0))>;
+
+ def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
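+
+  // The add-form patterns above fold an immediate byte offset into the
+  // LD1RQ addressing mode; simm4s16 limits this to multiples of 16 in
+  // [-128, 112], e.g. (a sketch):
+  //
+  //   ld1rqb { z0.b }, p0/z, [x0, #16]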
+
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
@@ -1105,346 +1489,899 @@ let Predicates = [HasSVE] in {
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
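
  // The sext_inreg patterns above use the predicated sign-extend
  // instructions under an all-active predicate (pattern 31 of PTRUE is
  // ALL), e.g. (a sketch):
  //
  //   sext_inreg nxv2i64 from nxv2i32  ->  ptrue p0.d
  //                                        sxtw  z0.d, p0/m, z1.d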
- def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-
- def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-
- def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-
- def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-
- def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-
- def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-
- def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ // General case that we ideally never want to match.
+ def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
+
+ let AddedComplexity = 5 in {
+ def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
+ def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
+
+ def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
+ def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
+
+ def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
+ }
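+
+  // RDVL #1 reads the vector length in bytes, i.e. 16 * vscale, so for
+  // example (vscale (i64 1)) is materialized roughly as:
+  //   rdvl x8, #1
+  //   lsr  x8, x8, #4   // the (UBFMXri $x, 4, 63) above is an LSR by 4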
+
+  // FIXME: BigEndian requires an additional REV instruction to satisfy the
+  // constraint that none of the bits change when stored to memory as one
+  // type, and reloaded as another type.
+ let Predicates = [IsLE] in {
+ def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+ def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+ def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+ def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+ def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+
+ def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+
+ def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+ }
+
+ let Predicates = [IsLE, HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+
+ def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ }
+
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
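+
+  // These casts are free: all the predicate types live in the one PPR
+  // register class, so COPY_TO_REGCLASS merely retypes the value.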
+
+ def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
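+
+  // The all-true PTRUE supplies the governing predicate here, so the
+  // zeroing AND_PPzPP acts as a plain bitwise AND of $Ps1 and $Ps2.
+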
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
- Instruction RegImmInst> {
-
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
}
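
  // Higher AddedComplexity wins: the reg+imm form (2) is tried before
  // reg+reg (1), which in turn beats the plain-base fallback, so e.g. a
  // load from [x0, #1, mul vl] selects the _IMM instruction directly.
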
// 2-element contiguous loads
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
- defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>;
- defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>;
- defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>;
- defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>;
- defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>;
- defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>;
+ defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ }
// 16-element contiguous loads
- defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
+ defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
- Instruction RegImmInst> {
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
+ (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous stores
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
- defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>;
- defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>;
- defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
- defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>;
- defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
- defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>;
- defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
- defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
+ defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ }
// 16-element contiguous stores
- defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+ defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
+
+ defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;
+
+ defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
+
+ multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
+ Instruction PTrue> {
+ let AddedComplexity = 1 in {
+ def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+ let AddedComplexity = 2 in {
+ def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
+ }
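+
+  // An unpredicated full-vector store is emitted as a predicated ST1
+  // governed by an all-true predicate; e.g. for nxv16i8, roughly:
+  //   ptrue p0.b
+  //   st1b  { z0.b }, p0, [x0]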
+
+ defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
+ defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
+ defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
+ defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
+ defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
+ defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
+ defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
+  defm : unpred_store<         store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
+
+ multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
+ Instruction PTrue> {
+ let AddedComplexity = 1 in {
+ def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
+ (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ let AddedComplexity = 2 in {
+ def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
+ (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ def : Pat<(Ty (Load GPR64:$base)),
+ (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
+ }
+
+ defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
+ defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
+ defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
+ defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
+ defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
+ defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
+ defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
+ defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
+ defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
+ defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
+ defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
+ defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
+
+ multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
+ def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
+ (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
+
+ def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
+ (Store PPR:$Val, GPR64:$base, (i64 0))>;
+ }
+
+ defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
+ defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
+ defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
+ defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
+
+ multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
+ def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
+ (Load GPR64sp:$base, simm9:$offset)>;
+
+ def _default : Pat<(Ty (load GPR64:$base)),
+ (Load GPR64:$base, (i64 0))>;
+ }
+
+ defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
+ defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
+ defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
+ defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
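+
+  // All four predicate types share STR_PXI/LDR_PXI: str/ldr of a P
+  // register spills or fills the whole register, so the element size
+  // of the predicate does not matter here.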
+
+ multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
+ SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
+ (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
+ (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>;
+ }
+
+ // 2-element contiguous loads
+ defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+ defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+
+ // 4-element contiguous loads
+ defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+
+ // 8-element contiguous loads
+ defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
+ }
+
+ // 16-element contiguous loads
+ defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
+ (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (I PPR:$gp, GPR64sp:$base, (i64 0))>;
+ }
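+
+  // Unlike ld1 above, there is no reg+reg alternative: the
+  // non-faulting LDNF1 instructions only offer scalar+immediate
+  // addressing.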
+
+ // 2-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>;
+ defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>;
+
+ // 4-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>;
+ defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>;
+
+ // 8-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
+ defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
+ }
+
+ // 16-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;
- defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
- defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>;
- defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>;
- defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>;
+ multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
+ (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
- defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
- defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
- defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
- defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
+  // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (I PPR:$gp, GPR64sp:$base, XZR)>;
+ }
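+
+  // LDFF1 is the mirror image: it only has a scalar+scalar form, so
+  // the plain-base case passes XZR as the offset register instead of
+  // an immediate.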
+
+ // 2-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+ defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+
+ // 4-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+
+ // 8-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
+ }
+
+ // 16-element contiguous first faulting loads
+ defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
+ SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
+ (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
+ }
+
+  // 2-element contiguous stores
+ defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+
+  // 4-element contiguous stores
+ defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+
+  // 8-element contiguous stores
+ defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+
+  // 16-element contiguous stores
+ defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
+ (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+ // Insert scalar into vector[0]
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
+ (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
+
+ def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
+ (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
+ def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
+ (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
+ def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
+ (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
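+
+  // PTRUE with pattern 1 (VL1) activates only the first lane, so the
+  // predicated CPY/SEL above write lane 0 and leave the remaining
+  // lanes of the destination intact.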
+
+ // Insert scalar into vector with scalar index
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_B ZPR:$vec,
+ (CMPEQ_PPzZZ_B (PTRUE_B 31),
+ (INDEX_II_B 0, 1),
+ (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_H ZPR:$vec,
+ (CMPEQ_PPzZZ_H (PTRUE_H 31),
+ (INDEX_II_H 0, 1),
+ (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_S ZPR:$vec,
+ (CMPEQ_PPzZZ_S (PTRUE_S 31),
+ (INDEX_II_S 0, 1),
+ (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
+ (CPY_ZPmR_D ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D GPR64:$index)),
+ GPR64:$src)>;
+
+ // Insert FP scalar into vector with scalar index
+ def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+ (CPY_ZPmV_H ZPR:$vec,
+ (CMPEQ_PPzZZ_H (PTRUE_H 31),
+ (INDEX_II_H 0, 1),
+ (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ $src)>;
+ def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+ (CPY_ZPmV_S ZPR:$vec,
+ (CMPEQ_PPzZZ_S (PTRUE_S 31),
+ (INDEX_II_S 0, 1),
+ (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ $src)>;
+ def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
+ (CPY_ZPmV_D ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D $index)),
+ $src)>;
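+
+  // The scalar-indexed inserts synthesize a one-lane predicate: INDEX
+  // produces (0, 1, 2, ...), DUP broadcasts $index, and CMPEQ then
+  // leaves exactly the lane equal to $index active for the copy.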
+
+ // Extract element from vector with immediate index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
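+
+  // DUP_ZZI broadcasts the requested element across the vector, after
+  // which reading the low subregister (hsub/ssub/dsub) yields the
+  // scalar without a separate move.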
+
+ // Extract element from vector with scalar index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
+
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
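+
+  // WHILELS XZR, $index builds a predicate whose active lanes run from
+  // 0 up to and including $index, so LASTB (last active element) reads
+  // exactly lane $index.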
+}
+
+let Predicates = [HasSVE, HasMatMulInt8] in {
+ defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
+ defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
+ defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
+ defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
+ defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
+ defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP32] in {
+ defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP64] in {
+ defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
+ defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>;
+ defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>;
+ defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>;
+ defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>;
+ defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>;
+ defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
+ defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
+ defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
+ defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
+ defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
+ defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
+ defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
+ defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
+ defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
}
let Predicates = [HasSVE2] in {
// SVE2 integer multiply-add (indexed)
- defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">;
- defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">;
+ defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
+ defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
// SVE2 saturating multiply-add high (indexed)
- defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">;
- defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">;
+ defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
+ defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>;
// SVE2 saturating multiply-add high (vectors, unpredicated)
- defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">;
- defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">;
+ defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>;
+ defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;
// SVE2 integer multiply (indexed)
- defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">;
+ defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;
// SVE2 saturating multiply high (indexed)
- defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">;
- defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">;
+ defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
+ defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>;
// SVE2 signed saturating doubling multiply high (unpredicated)
- defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">;
- defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">;
+ defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>;
+ defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;
// SVE2 integer multiply vectors (unpredicated)
- defm MUL_ZZZ : sve2_int_mul<0b000, "mul">;
- defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">;
- defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">;
- def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>;
-
+ defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
+ defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
+ defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
+ defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
+
+  // Add patterns for the unpredicated versions of smulh and umulh.
+ def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
+ (SMULH_ZZZ_B $Op1, $Op2)>;
+ def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
+ (SMULH_ZZZ_H $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
+ (SMULH_ZZZ_S $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
+ (SMULH_ZZZ_D $Op1, $Op2)>;
+ def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
+ (UMULH_ZZZ_B $Op1, $Op2)>;
+ def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
+ (UMULH_ZZZ_H $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
+ (UMULH_ZZZ_S $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
+ (UMULH_ZZZ_D $Op1, $Op2)>;
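+
+  // Under an all-true governing predicate the predicated smulh/umulh
+  // intrinsics are equivalent to the unpredicated instructions, which
+  // is the only case these patterns need to cover.
+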
// SVE2 complex integer dot product (indexed)
- defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">;
+ defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
// SVE2 complex integer dot product
- defm CDOT_ZZZ : sve2_cintx_dot<"cdot">;
+ defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>;
// SVE2 complex integer multiply-add (indexed)
- defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">;
+ defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>;
// SVE2 complex saturating multiply-add (indexed)
- defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">;
+ defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>;
// SVE2 complex integer multiply-add
- defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">;
- defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">;
+ defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>;
+ defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// SVE2 integer multiply long (indexed)
- defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
- defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
- defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
- defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
+ defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
+ defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
+ defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
+ defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
- defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
- defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
+ defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
+ defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
- defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">;
- defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">;
- defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">;
- defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">;
- defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">;
- defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">;
- defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">;
- defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">;
+ defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
+ defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>;
+ defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>;
+ defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>;
+ defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>;
+ defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>;
+ defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>;
+ defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>;
// SVE2 integer multiply-add long (vectors, unpredicated)
- defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">;
- defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">;
- defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">;
- defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">;
- defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">;
- defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">;
- defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">;
- defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">;
+ defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>;
+ defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>;
+ defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>;
+ defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>;
+ defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>;
+ defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>;
+ defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
+ defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
// SVE2 saturating multiply-add long (indexed)
- defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">;
- defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">;
- defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">;
- defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">;
+ defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
+ defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
+ defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>;
+ defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>;
// SVE2 saturating multiply-add long (vectors, unpredicated)
- defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">;
- defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">;
- defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">;
- defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">;
+ defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>;
+ defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>;
+ defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>;
+ defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>;
// SVE2 saturating multiply-add interleaved long
- defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">;
- defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;
+ defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>;
+ defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
// SVE2 integer halving add/subtract (predicated)
- defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">;
- defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">;
- defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">;
- defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">;
- defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">;
- defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">;
- defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">;
- defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">;
+ defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
+ defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
+ defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
+ defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
+ defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
+ defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
+ defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
+ defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
// SVE2 integer pairwise add and accumulate long
- defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">;
- defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">;
+ defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
+ defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;
// SVE2 integer pairwise arithmetic
- defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">;
- defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">;
- defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">;
- defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">;
- defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">;
+ defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
+ defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
+ defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
+ defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
+ defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;
// SVE2 integer unary operations (predicated)
- defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
- defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">;
- defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">;
- defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;
+ defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>;
+ defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>;
+ defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>;
+ defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>;
// SVE2 saturating add/subtract
- defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">;
- defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">;
- defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">;
- defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">;
- defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">;
- defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">;
- defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">;
- defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">;
+ defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>;
+ defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>;
+ defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>;
+ defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>;
+ defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>;
+ defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>;
+ defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>;
+ defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
// SVE2 saturating/rounding bitwise shift left (predicated)
- defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">;
- defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">;
- defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">;
- defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">;
- defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">;
- defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">;
- defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">;
- defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">;
- defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">;
- defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">;
- defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
- defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+ defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
+ defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
+ defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
+ defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
+ defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
+ defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
+ defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
+ defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
+ defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
+ defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
+ defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
+ defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
+
+ let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
+ defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
+ defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
+ }
// SVE2 predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
- defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
- defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
- defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">;
- defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">;
- defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">;
- defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">;
- defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">;
- defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">;
- defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">;
- defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">;
- defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">;
- defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">;
+ defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
+ defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
+ defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
+ defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
+ defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
+ defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
+ defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
+ defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>;
+ defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>;
+ defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>;
+ defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>;
+ defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
// SVE2 integer add/subtract wide
- defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
- defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
- defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
- defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
- defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
- defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
- defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
- defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
+ defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
+ defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
+ defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
+ defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
+ defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
+ defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
+ defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
+ defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
- defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">;
- defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">;
- defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">;
- defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">;
- defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">;
- defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">;
- defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">;
- defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
+ defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
+ defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
+ defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>;
+ defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>;
+ defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>;
+ defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>;
+ defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
+ defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;
// SVE2 bitwise shift and insert
- defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
- defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;
+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
- defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
- defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
- defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;
// SVE2 complex integer add
- defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
- defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">;
+ defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
+ defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;
// SVE2 integer absolute difference and accumulate
- defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">;
- defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">;
+ defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
+ defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;
// SVE2 integer absolute difference and accumulate long
- defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">;
- defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">;
- defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">;
- defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">;
+ defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
+ defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>;
+ defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>;
+ defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>;
// SVE2 integer add/subtract long with carry
- defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">;
- defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">;
- defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
- defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
+ defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>;
+ defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>;
+ defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>;
+ defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>;
// SVE2 bitwise shift right narrow (bottom)
defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
@@ -1489,29 +2426,29 @@ let Predicates = [HasSVE2] in {
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
// SVE2 character match
- defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
- defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">;
+ defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
+ defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
// SVE2 bitwise exclusive-or interleaved
- defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
- defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
+ defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
+ defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
// SVE2 bitwise shift left long
- defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
- defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
- defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
- defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
+ defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
+ defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
+ defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
+ defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
- defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">;
- defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">;
- defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">;
+ defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
+ defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
+ defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
// SVE2 histogram generation (segment)
- def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">;
+ def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;
// SVE2 histogram generation (vector)
- defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+ defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
@@ -1542,50 +2479,57 @@ let Predicates = [HasSVE2] in {
defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
// SVE2 bitwise ternary operations
- defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
- defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">;
- def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">;
- def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">;
- def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
- def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
+ defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
+ defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
+ defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
+ defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
+ defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
+ defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
- defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
+ defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
- defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
- defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
- defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
- defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
- defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
+ defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
+ defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
+ def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
+ (nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+ }
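+
+ // A hedged sketch of the equivalent pattern the sve2_int_perm_tbl
+ // multiclass is assumed to create for the integer element types,
+ // mirroring the bf16 special case above (illustrative, not part of
+ // this change):
+ //   def : Pat<(nxv8i16 (int_aarch64_sve_tbl2 nxv8i16:$Op1, nxv8i16:$Op2,
+ //                                            nxv8i16:$Op3)),
+ //             (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ //                                             nxv8i16:$Op2, zsub1),
+ //                         nxv8i16:$Op3)>;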
// SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
@@ -1599,43 +2543,41 @@ let Predicates = [HasSVE2] in {
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
// SVE2 pointer conflict compare
- defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
- defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
+ defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
+ defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
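+
+ // The last argument is a string rather than an SDPatternOperator because
+ // there is one intrinsic per element size; the multiclass is assumed to
+ // !cast the concatenated names into per-size patterns, roughly:
+ //   def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # "_b"),
+ //                      i64, i64, WHILEWR_PXX_B>;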
}
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
- def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>;
- def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>;
+ defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
+ defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
// SVE2 crypto unary operations
- def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">;
- def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">;
+ defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
+ defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>;
// PMULLB and PMULLT instructions, which operate on 64-bit source and
// 128-bit destination elements, are enabled by the crypto extensions,
// similar to the NEON PMULL2 instruction.
- def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb",
- ZPR128, ZPR64, ZPR64>;
- def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt",
- ZPR128, ZPR64, ZPR64>;
+ defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
+ defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
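+
+ // A sketch of the selection pattern the pmul multiclass is assumed to
+ // attach: the pair intrinsics keep both operands in nxv2i64, matching
+ // the NEON PMULL2 analogy above (illustrative only):
+ //   def : Pat<(nxv2i64 (int_aarch64_sve_pmullb_pair nxv2i64:$Op1,
+ //                                                   nxv2i64:$Op2)),
+ //             (PMULLB_ZZZ_Q $Op1, $Op2)>;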
}
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
- def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>;
+ defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
- def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>;
+ defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
}
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
- def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>;
+ defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
}
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
- defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">;
- defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">;
- defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">;
+ defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
+ defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
+ defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index a6df0f3f083cb..c5ff1fcb274b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -26,7 +26,8 @@ def CortexA53Model : SchedMachineModel {
// v 1.0 Spreadsheet
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 9f566d1c7079b..7c40da05c3056 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -31,7 +31,8 @@ def CortexA57Model : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -501,7 +502,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
// Q form - v16i8, v8i16, v4i32, v2i64
// ASIMD bitwise insert, Q-form
-def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>;
// ASIMD duplicate, gen reg, D-form and Q-form
def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 798ecb7508c08..8abcb804d5c71 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -18,7 +18,8 @@ def CycloneModel : SchedMachineModel {
let MispredictPenalty = 16; // 14-19 cycles are typical.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -494,7 +495,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
-// BIF,BIT,BSL
+// BIF,BIT,BSL,BSP
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index d1734c455b2b4..8413a06ed3916 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -24,7 +24,8 @@ def ExynosM3Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -660,7 +661,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>;
-def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>;
def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index d2284f9fa0b50..34e8beb423ce9 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -24,7 +24,8 @@ def ExynosM4Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -803,7 +804,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>;
-def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>;
def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index df7402591e7b9..403aac80e47bf 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -24,7 +24,8 @@ def ExynosM5Model : SchedMachineModel {
let MispredictPenalty = 15; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -841,7 +842,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>;
-def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>;
def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 92d03963de57f..a17ab36d7f9e0 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -23,8 +23,8 @@ def FalkorModel : SchedMachineModel {
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 697a0f69c58cb..f2cd83caffa2b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
@@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
(instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
-def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index 0e1a24103121e..ba14bf1f50de1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -27,8 +27,8 @@ def KryoModel : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 4c60992e6351a..bc5ad0f8beced 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln :
let Latency = 1; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
- (instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+ (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>;
def KryoWrite_1cyc_X_X_75ln :
SchedWriteRes<[KryoUnitX, KryoUnitX]> {
let Latency = 1; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_1cyc_X_X_75ln],
- (instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+ (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>;
def KryoWrite_0cyc_noRSV_11ln :
SchedWriteRes<[]> {
let Latency = 0; let NumMicroOps = 1;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index 3b6aecf5c0353..9c50f97085830 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -25,8 +25,8 @@ def ThunderXT8XModel : SchedMachineModel {
let PostRAScheduler = 1; // Use PostRA scheduler.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index e2a293c068774..95c29dd2a567f 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -25,8 +25,8 @@ def ThunderX2T99Model : SchedMachineModel {
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
@@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
// ASIMD bitwise insert, D-form
// ASIMD bitwise insert, Q-form
def : InstRW<[THX2T99Write_5Cyc_F01],
- (instregex "^BIFv", "^BITv", "^BSLv")>;
+ (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>;
// ASIMD count, D-form
// ASIMD count, Q-form
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
new file mode 100644
index 0000000000000..00838cc4b9bd4
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
@@ -0,0 +1,1997 @@
+//=- AArch64SchedThunderX3T110.td - Marvell ThunderX3 T110 ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Marvell ThunderX3T110
+// family of processors.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pipeline Description.
+
+def ThunderX3T110Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 70; // 70 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 128; // FIXME: might be much bigger in TX3.
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
+}
+
+let SchedModel = ThunderX3T110Model in {
+
+// Issue ports.
+
+// Port 0: ALU.
+def THX3T110P0 : ProcResource<1>;
+
+// Port 1: ALU.
+def THX3T110P1 : ProcResource<1>;
+
+// Port 2: ALU/Branch.
+def THX3T110P2 : ProcResource<1>;
+
+// Port 3: ALU/Branch.
+def THX3T110P3 : ProcResource<1>;
+
+// Port 4: Load/Store.
+def THX3T110P4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def THX3T110P5 : ProcResource<1>;
+
+// Port 6: FP/Neon/SIMD/Crypto.
+def THX3T110P6FP0 : ProcResource<1>;
+
+// Port 7: FP/Neon/SIMD/Crypto.
+def THX3T110P7FP1 : ProcResource<1>;
+
+// Port 8: FP/Neon/SIMD/Crypto.
+def THX3T110P8FP2 : ProcResource<1>;
+
+// Port 9: FP/Neon/SIMD/Crypto.
+def THX3T110P9FP3 : ProcResource<1>;
+
+// Port 10: Store Data Unit.
+def THX3T110SD0 : ProcResource<1>;
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes.
+
+// Integer divide/mulhi micro-ops only on port I1.
+def THX3T110I1 : ProcResGroup<[THX3T110P1]>;
+
+// Branch micro-ops on ports I2/I3.
+def THX3T110I23 : ProcResGroup<[THX3T110P2, THX3T110P3]>;
+
+// Branch micro-ops on ports I1/I2/I3.
+def THX3T110I123 : ProcResGroup<[THX3T110P1, THX3T110P2, THX3T110P3]>;
+
+// Integer micro-ops on ports I0/I1/I2.
+def THX3T110I012 : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2]>;
+
+// Integer micro-ops on ports I0/I1/I2/I3.
+def THX3T110I0123 : ProcResGroup<[THX3T110P0, THX3T110P1,
+ THX3T110P2, THX3T110P3]>;
+
+// FP micro-ops on ports FP0/FP1/FP2/FP3.
+def THX3T110FP0123 : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]>;
+
+// FP micro-ops on ports FP2/FP3.
+def THX3T110FP23 : ProcResGroup<[THX3T110P8FP2, THX3T110P9FP3]>;
+
+// ASIMD micro-ops on ports FP0/FP1/FP2/FP3.
+def THX3T110SIMD : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]>;
+
+// Store data micro-ops only on port 10.
+def THX3T110SD : ProcResGroup<[THX3T110SD0]>;
+
+// Load/store micro-ops on ports P4/P5.
+def THX3T110LS : ProcResGroup<[THX3T110P4, THX3T110P5]>;
+
+// 70 entry unified scheduler.
+def THX3T110ANY : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2,
+ THX3T110P3, THX3T110P4, THX3T110P5,
+ THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]> {
+ let BufferSize = 70;
+}
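+
+// Every group above is consumed by a write definition further down; for
+// instance, branch micro-ops draw on the I2/I3 group (see the WriteBr
+// definition in the instruction tables below), along the lines of:
+//   def : WriteRes<WriteBr, [THX3T110I23]> { let Latency = 1; }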
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: THX3T110Write_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def THX3T110Write_3Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I1.
+def THX3T110Write_4Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on I1.
+def THX3T110Write_5Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on I1.
+def THX3T110Write_7Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 23 cycles on I1.
+def THX3T110Write_23Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// 39 cycles on I1.
+def THX3T110Write_39Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on I2/I3
+def THX3T110Write_1Cyc_I23 : SchedWriteRes<[THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on I2/I3
+def THX3T110Write_8Cyc_I23 : SchedWriteRes<[THX3T110I23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on I1/I2/I3
+def THX3T110Write_1Cyc_I123 : SchedWriteRes<[THX3T110I123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on I1/I2/I3
+def THX3T110Write_8Cyc_I123 : SchedWriteRes<[THX3T110I123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on I0/I1/I2/I3.
+def THX3T110Write_1Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on I0/I1/I2/I3.
+def THX3T110Write_2Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 3 cycles on I0/I1/I2/I3.
+def THX3T110Write_3Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I0/I1/I2/I3.
+def THX3T110Write_4Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on I0/I1/I2/I3.
+def THX3T110Write_5Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on I0/I1/I2/I3.
+def THX3T110Write_6Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on I0/I1/I2/I3.
+def THX3T110Write_8Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+// 13 cycles on I0/I1/I2/I3.
+def THX3T110Write_13Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+}
+
+// 23 cycles on I0/I1/I2/I3.
+def THX3T110Write_23Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+}
+
+// 39 cycles on I0/I1/I2/I3.
+def THX3T110Write_39Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 39;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on F2/F3.
+def THX3T110Write_4Cyc_F23 : SchedWriteRes<[THX3T110FP23]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0/F1/F2/F3.
+def THX3T110Write_5Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on F0/F1/F2/F3.
+def THX3T110Write_6Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on F0/F1/F2/F3.
+def THX3T110Write_7Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on F0/F1/F2/F3.
+def THX3T110Write_8Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 10 cycles on F0/F1/F2/F3.
+def THX3T110Write_10Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+// 16 cycles on F0/F1/F2/F3.
+def THX3T110Write_16Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0/F1/F2/F3.
+def THX3T110Write_23Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0/LS1.
+def THX3T110Write_1Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+// 2 cycles on LS0/LS1.
+def THX3T110Write_2Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1.
+def THX3T110Write_4Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+// 5 cycles on LS0/LS1.
+def THX3T110Write_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1.
+def THX3T110Write_6Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 4 + 5 cycles on LS0/LS1.
+// First resource is available after 4 cycles.
+// Second resource is available after 5 cycles.
+// Load vector pair, immed offset, Q-form [LDP/LDNP].
+def THX3T110Write_4_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4, 5];
+}
+
+// 4 + 8 cycles on LS0/LS1.
+// First resource is available after 4 cycles.
+// Second resource is available after 8 cycles.
+// Load vector pair, immed offset, S/D-form [LDP/LDNP].
+def THX3T110Write_4_8Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4, 8];
+}
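+
+// For reference, a minimal illustration of how per-resource cycles read
+// (assuming the usual SchedWriteRes semantics; commented out, not a live
+// definition):
+//   def ExampleWrite_LS_ALU : SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+//     let Latency = 4;             // result available after 4 cycles
+//     let ResourceCycles = [4, 5]; // LS held 4 cycles, ALU held 5
+//   }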
+
+// 11 cycles on LS0/LS1 and I1.
+def THX3T110Write_11Cyc_LS01_I1 :
+ SchedWriteRes<[THX3T110LS, THX3T110I1]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_I012 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0/LS1 and SD.
+def THX3T110Write_1Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on LS0/LS1 and SD.
+def THX3T110Write_2Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1 and SD.
+def THX3T110Write_4Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and SD.
+def THX3T110Write_5Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1 and SD.
+def THX3T110Write_6Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+// 1 cycle on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_2Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+// 1 cycle on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_1Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_5Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_6Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_7Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_8Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_8Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 12 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_12Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
+
+// 16 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_16Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 16;
+ let NumMicroOps = 5;
+}
+
+// 24 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_24Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+}
+
+// 32 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_32Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 32;
+ let NumMicroOps = 14;
+}
+
+// 3 cycles on F0/F1/F2/F3.
+def THX3T110Write_3Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on F0/F1/F2/F3.
+def THX3T110Write_4Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0/F1/F2/F3.
+def THX3T110Write_5Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 10 cycles on F0/F1/F2/F3.
+def THX3T110Write_10Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+}
+
+// 15 cycles on F0/F1/F2/F3.
+def THX3T110Write_15Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 15;
+ let NumMicroOps = 7;
+}
+
+// 16 cycles on F0/F1/F2/F3.
+def THX3T110Write_16Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+}
+
+// 18 cycles on F0/F1/F2/F3.
+def THX3T110Write_18Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 18;
+ let NumMicroOps = 3;
+}
+
+// 19 cycles on F0/F1/F2/F3.
+def THX3T110Write_19Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 19;
+ let NumMicroOps = 4;
+}
+
+// 20 cycles on F0/F1/F2/F3.
+def THX3T110Write_20Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 20;
+ let NumMicroOps = 4;
+}
+
+// 23 cycles on F0/F1/F2/F3.
+def THX3T110Write_23Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let NumMicroOps = 4;
+}
+
+// 3 cycles on F2/F3 and 4 cycles on F0/F1/F2/F3.
+def THX3T110Write_3_4Cyc_F23_F0123 :
+ SchedWriteRes<[THX3T110FP23, THX3T110FP0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [3, 4];
+}
+
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs B, BL, BR, BLR)>;
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs Bcc)>;
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs RET)>;
+def : InstRW<[THX3T110Write_1Cyc_I23],
+ (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [THX3T110I0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [THX3T110I0123]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : WriteRes<WriteIEReg, [THX3T110I0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+// Move immed
+def : WriteRes<WriteImm, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
+
+// Variable shift
+def : WriteRes<WriteIS, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23/13-39.
+def : WriteRes<WriteID32, [THX3T110I1]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+ let NumMicroOps = 4;
+}
+
+// Divide, X-form
+def : WriteRes<WriteID64, [THX3T110I1]> {
+ let Latency = 39;
+ let ResourceCycles = [39];
+ let NumMicroOps = 4;
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX3T110Write_5Cyc_I012],
+// (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[THX3T110Write_5Cyc_I0123],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Multiply high
+def : InstRW<[THX3T110Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
+
+// Bitfield move, insert
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "^BFM")>;
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "(S|U)?BFM.*")>;
+
+// Count leading
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AES[DE]")>;
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AESI?MC")>;
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^PMULL")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1SU0")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1[CMP]")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256SU0")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256(H|H2|SU1)")>;
+
+// CRC Instructions
+// def : InstRW<[THX3T110Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>;
+def : InstRW<[THX3T110Write_4Cyc_I1],
+ (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I1],
+ (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def THX3T110WriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [THX3T110Write_4Cyc_LS01_I0123_I0123]>,
+ SchedVar<NoSchedPred, [THX3T110Write_4Cyc_LS01_I0123]>]>;
+def : SchedAlias<WriteLDIdx, THX3T110WriteLDIdx>;
+
+def THX3T110ReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, THX3T110ReadAdrBase>;
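+
+// Net effect of the two variants above (a reading of ScaledIdxPred, which
+// is defined elsewhere): a scaled register-offset load spends an extra ALU
+// micro-op on address generation, e.g.
+//   ldr x0, [x1, x2, lsl #3]  // scaled   -> THX3T110Write_4Cyc_LS01_I0123_I0123
+//   ldr x0, [x1, x2]          // unscaled -> THX3T110Write_4Cyc_LS01_I0123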
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRBui)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDui)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRHui)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRQui)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRSui)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRQl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRWl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRXl)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSWi)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre,
+ LDRSpre, LDRWpre, LDRXpre,
+ LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost,
+ LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost,
+ LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteI],
+ (instrs LDRBpost, LDRDpost, LDRHpost,
+ LDRQpost, LDRSpost, LDRWpost, LDRXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre, LDPXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteAdr],
+ (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre,
+ LDRSpre, LDRWpre, LDRXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteI],
+ (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost,
+ LDRSpost, LDRWpost, LDRXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSWi)>;
+
+// Load exclusive
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXP(W|X)$")>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMl)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFUMi)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMui)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroW)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
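+// NOTE: Register-offset stores occupy the LS pipe for the access, the SD
+// pipe for the store data, and an integer unit for the index; the 1-cycle
+// latency presumably models hand-off to the store queue rather than
+// global visibility.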
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBBi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHHi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURSi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURWi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRXi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPXi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPWi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPXi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPWi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRBui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRDui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRHui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRQui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRXui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRWui)>;
+
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRBui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRDui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRHui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRQui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRXui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRWui)>;
+
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRBui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRDui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRHui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRQui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRXui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRWui)>;
+
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
+// Store non-temporal pair, store release, store exclusive
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instrs STNPWi, STNPXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXR(B|H|W|X)$")>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP arithmetic
+def : InstRW<[THX3T110Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFDiv, [THX3T110FP0123]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
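+// NOTE: ResourceCycles = [19] marks the divider as mostly non-pipelined:
+// the FP resource stays busy for 19 of the 22 latency cycles, so streams
+// of divides largely serialize.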
+
+def THX3T110XWriteFDiv : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFDivSP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFDivDP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFSqrtSP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFSqrtDP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[THX3T110XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THX3T110XWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[THX3T110XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THX3T110XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THX3T110Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[THX3T110XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THX3T110XWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[THX3T110XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THX3T110XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[THX3T110Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX3T110XWriteFMul : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX3T110XWriteFMulAcc : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def : InstRW<[THX3T110XWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[THX3T110XWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[THX3T110Write_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
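+// NOTE: WriteV is the catch-all ASIMD class (5 cycles, 4 micro-ops on
+// FP0-FP3); the InstRW entries below override it wherever an instruction's
+// timing differs from this default.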
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD arith, reduce
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 8B/4H/2S
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8H/4S
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[THX3T110Write_10Cyc_F0123],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^(P?MUL|SQR?DMULH)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv",
+ "SQSHRNv","SQSHRUNv", "UQRSHRNv",
+ "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADALP","^UADALP")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLPv","^UADDLPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLV","^UADDLV")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SUQADDv","^USQADDv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv",
+ "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUBHNv", "^SUQADD",
+ "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv",
+ "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[THX3T110Write_5Cyc_F01], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>;
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv2f32")>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv4f32)>;
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv4f32")>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>;
+def : InstRW<[THX3T110Write_23Cyc_F0123], (instregex "FDIVv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^MOVIv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FMOVv")>;
+
+// ASIMD transpose
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
+def : InstRW<[THX3T110Write_10Cyc_F0123],
+ (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
+def : InstRW<[THX3T110Write_15Cyc_F0123],
+ (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
+def : InstRW<[THX3T110Write_20Cyc_F0123],
+ (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to word or doubleword
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX3T110Write_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX3T110Write_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX3T110Write_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123],
+ (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123],
+ (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+// V8.1a Atomics (LSE)
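+// The CAS/LDADD/LDCLR/LDEOR/LDSET/SWP families below follow a simple tier:
+// plain forms take 4 cycles, acquire-only or release-only forms take 6, and
+// acquire+release ("AL") forms take 8; the min/max family is modeled flat
+// at 4 cycles.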
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs CASB, CASH, CASW, CASX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs CASAB, CASAH, CASAW, CASAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs CASLB, CASLH, CASLW, CASLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs CASALB, CASALH, CASALW, CASALX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDLARB, LDLARH, LDLARW, LDLARX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDADDB, LDADDH, LDADDW, LDADDX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDEORB, LDEORH, LDEORW, LDEORX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSETB, LDSETH, LDSETW, LDSETX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX,
+ LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX,
+ LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX,
+ LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX,
+ LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX,
+ LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX,
+ LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX,
+ LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX,
+ LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX,
+ LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX,
+ LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX,
+ LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX,
+ LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs SWPB, SWPH, SWPW, SWPX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs SWPAB, SWPAH, SWPAW, SWPAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs SWPLB, SWPLH, SWPLW, SWPLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs SWPALB, SWPALH, SWPALW, SWPALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs STLLRB, STLLRH, STLLRW, STLLRX)>;
+
+// V8.3a PAC
+def : InstRW<[THX3T110Write_11Cyc_LS01_I1], (instregex "^LDRAA", "^LDRAB")>;
+def : InstRW<[THX3T110Write_8Cyc_I123],
+ (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ,
+ BRAA, BRAAZ, BRAB, BRABZ)>;
+def : InstRW<[THX3T110Write_8Cyc_I123], (instrs RETAA, RETAB)>;
+
+} // SchedModel = ThunderX3T110Model
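+
+// NOTE: A quick way to sanity-check these latencies (assuming the CPU name
+// wired up to this model is "thunderx3t110") is to run a snippet through
+// llvm-mca, e.g.:
+//   llvm-mca -mtriple=aarch64 -mcpu=thunderx3t110 snippet.s
+// and compare the reported per-instruction latencies against the WriteRes
+// and InstRW entries above.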
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index ba61ed726e840..8f814d185e859 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
@@ -117,7 +117,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand(
- DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16);
+ DstPtrInfo, MachineMemOperand::MOStore, ObjSize, Align(16));
bool UseSetTagRangeLoop =
kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold;
@@ -125,21 +125,18 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
ZeroData);
- if (ObjSize % 32 != 0) {
- SDNode *St1 = DAG.getMachineNode(
- ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
- {MVT::i64, MVT::Other},
- {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
- DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
- ObjSize -= 16;
- Addr = SDValue(St1, 0);
- Chain = SDValue(St1, 1);
- }
-
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
- SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
- SDNode *St = DAG.getMachineNode(
- ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);
+
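+ // Select the tag-store loop pseudo: a frame-index base keeps the plain
+ // form so the index can be resolved during frame lowering, while any
+ // other base uses the write-back variant that updates the address
+ // register as it stores.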
+ unsigned Opcode;
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
+ Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
+ Opcode = ZeroData ? AArch64::STZGloop : AArch64::STGloop;
+ } else {
+ Opcode = ZeroData ? AArch64::STZGloop_wback : AArch64::STGloop_wback;
+ }
+ SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+ SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops);
DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand});
return SDValue(St, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d0967fb973cc3..d94fd8471b7b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -21,7 +21,8 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment,
+ bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h
index f95b5dc5246e9..6fa1c744f77e2 100644
--- a/llvm/lib/Target/AArch64/AArch64StackOffset.h
+++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h
@@ -16,6 +16,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TypeSize.h"
+#include <cassert>
namespace llvm {
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 975502818fcd2..61f27cbc3b29d 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -19,10 +19,13 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -44,6 +47,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -61,6 +65,11 @@ static cl::opt<bool> ClMergeInit(
"stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("merge stack variable initializers with tagging when possible"));
+static cl::opt<bool>
+ ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden,
+ cl::init(true), cl::ZeroOrMore,
+ cl::desc("Use Stack Safety analysis results"));
+
static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
cl::init(40), cl::Hidden);
@@ -256,8 +265,9 @@ public:
Type *EltTy = VecTy->getElementType();
if (EltTy->isPointerTy()) {
uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
- Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
- VecTy->getNumElements());
+ auto *NewTy = FixedVectorType::get(
+ IntegerType::get(Ctx, EltSize),
+ cast<FixedVectorType>(VecTy)->getNumElements());
V = IRB.CreatePointerCast(V, NewTy);
}
}
@@ -275,15 +285,17 @@ class AArch64StackTagging : public FunctionPass {
int Tag; // -1 for non-tagged allocations
};
- bool MergeInit;
+ const bool MergeInit;
+ const bool UseStackSafety;
public:
static char ID; // Pass ID, replacement for typeid
- AArch64StackTagging(bool MergeInit = true)
+ AArch64StackTagging(bool IsOptNone = false)
: FunctionPass(ID),
- MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit
- : MergeInit) {
+ MergeInit(ClMergeInit.getNumOccurrences() ? ClMergeInit : !IsOptNone),
+ UseStackSafety(ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety
+ : !IsOptNone) {
initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
}
@@ -305,13 +317,16 @@ public:
StringRef getPassName() const override { return "AArch64 Stack Tagging"; }
private:
- Function *F;
- Function *SetTagFunc;
- const DataLayout *DL;
- AAResults *AA;
+ Function *F = nullptr;
+ Function *SetTagFunc = nullptr;
+ const DataLayout *DL = nullptr;
+ AAResults *AA = nullptr;
+ const StackSafetyGlobalInfo *SSI = nullptr;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (UseStackSafety)
+ AU.addRequired<StackSafetyGlobalInfoWrapperPass>();
if (MergeInit)
AU.addRequired<AAResultsWrapperPass>();
}
@@ -323,11 +338,13 @@ char AArch64StackTagging::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass)
INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
-FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) {
- return new AArch64StackTagging(MergeInit);
+FunctionPass *llvm::createAArch64StackTaggingPass(bool IsOptNone) {
+ return new AArch64StackTagging(IsOptNone);
}
Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
@@ -400,7 +417,9 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
// dynamic alloca instrumentation for them as well.
!AI.isUsedWithInAlloca() &&
// swifterror allocas are register promoted by ISel
- !AI.isSwiftError();
+ !AI.isSwiftError() &&
+ // safe allocas are not interesting
+ !(SSI && SSI->isSafe(AI));
return IsInteresting;
}
@@ -482,7 +501,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
auto *NewAI = new AllocaInst(
TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
NewAI->takeName(Info.AI);
- NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment()));
+ NewAI->setAlignment(Info.AI->getAlign());
NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
NewAI->setSwiftError(Info.AI->isSwiftError());
NewAI->copyMetadata(*Info.AI);
@@ -516,6 +535,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
return false;
+ if (UseStackSafety)
+ SSI = &getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult();
F = &Fn;
DL = &Fn.getParent()->getDataLayout();
if (MergeInit)
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 5deb601822b8c..a94856ef4fba3 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -149,7 +149,9 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
continue;
const MachineOperand *BaseOp;
int64_t Offset;
- if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
+ bool OffsetIsScalable;
+ if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable,
+ TRI) &&
BaseOp->isReg()) {
Register BaseReg = BaseOp->getReg();
if (PrevBaseReg == BaseReg) {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 3636d8d2b628c..029535cb98b57 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -13,12 +13,12 @@
#include "AArch64Subtarget.h"
#include "AArch64.h"
-#include "AArch64CallLowering.h"
#include "AArch64InstrInfo.h"
-#include "AArch64LegalizerInfo.h"
#include "AArch64PBQPRegAlloc.h"
-#include "AArch64RegisterBankInfo.h"
#include "AArch64TargetMachine.h"
+#include "GISel/AArch64CallLowering.h"
+#include "GISel/AArch64LegalizerInfo.h"
+#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -47,6 +47,18 @@ static cl::opt<bool>
cl::desc("Call nonlazybind functions via direct GOT load"),
cl::init(false), cl::Hidden);
+static cl::opt<unsigned> SVEVectorBitsMax(
+ "aarch64-sve-vector-bits-max",
+ cl::desc("Assume SVE vector registers are at most this big, "
+ "with zero meaning no maximum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> SVEVectorBitsMin(
+ "aarch64-sve-vector-bits-min",
+ cl::desc("Assume SVE vector registers are at least this big, "
+ "with zero meaning no minimum size is assumed."),
+ cl::init(0), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
StringRef CPUString) {
@@ -68,6 +80,9 @@ void AArch64Subtarget::initializeProperties() {
switch (ARMProcFamily) {
case Others:
break;
+ case Carmel:
+ CacheLineSize = 64;
+ break;
case CortexA35:
break;
case CortexA53:
@@ -86,8 +101,16 @@ void AArch64Subtarget::initializeProperties() {
case CortexA73:
case CortexA75:
case CortexA76:
+ case CortexA77:
+ case CortexA78:
+ case CortexX1:
PrefFunctionLogAlignment = 4;
break;
+ case A64FX:
+ CacheLineSize = 256;
+ PrefFunctionLogAlignment = 5;
+ PrefLoopLogAlignment = 5;
+ break;
case AppleA7:
case AppleA10:
case AppleA11:
@@ -160,6 +183,17 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionLogAlignment = 4;
PrefLoopLogAlignment = 2;
break;
+ case ThunderX3T110:
+ CacheLineSize = 64;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 2;
+ MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ break;
}
}
@@ -177,6 +211,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
ReserveXRegister.set(18);
CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
+ InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Legalizer.reset(new AArch64LegalizerInfo(*this));
auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
@@ -194,6 +229,10 @@ const CallLowering *AArch64Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
+const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
+ return InlineAsmLoweringInfo.get();
+}
+
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -305,3 +344,25 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
if (!MFI.isMaxCallFrameSizeComputed())
MFI.computeMaxCallFrameSize(MF);
}
+
+unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ assert(SVEVectorBitsMax % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
+ "Minimum SVE vector size should not be larger than its maximum!");
+ if (SVEVectorBitsMax == 0)
+ return 0;
+ return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+}
+
+unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ assert(SVEVectorBitsMin % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
+ "Minimum SVE vector size should not be larger than its maximum!");
+ if (SVEVectorBitsMax == 0)
+ return (SVEVectorBitsMin / 128) * 128;
+ return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+}
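Editor's note: a standalone sketch of the clamping behaviour the two getters above implement, assuming (as the asserts require) that both knobs are multiples of 128:

    #include <algorithm>

    unsigned maxSVEBits(unsigned MinBits, unsigned MaxBits) {
      if (MaxBits == 0)
        return 0;                                   // no upper bound known
      return std::max(MinBits, MaxBits) / 128 * 128;
    }

    unsigned minSVEBits(unsigned MinBits, unsigned MaxBits) {
      if (MaxBits == 0)
        return MinBits / 128 * 128;
      return std::min(MinBits, MaxBits) / 128 * 128;
    }

    // minSVEBits(256, 512) == 256 and maxSVEBits(256, 512) == 512. The
    // std::min/std::max keep the two answers mutually consistent even in
    // release builds, where the asserts above compile away.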
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 79c2c161d3cb2..b111f00169488 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -19,6 +19,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -38,11 +39,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
public:
enum ARMProcFamilyEnum : uint8_t {
Others,
+ A64FX,
AppleA7,
AppleA10,
AppleA11,
AppleA12,
AppleA13,
+ Carmel,
CortexA35,
CortexA53,
CortexA55,
@@ -52,6 +55,9 @@ public:
CortexA73,
CortexA75,
CortexA76,
+ CortexA77,
+ CortexA78,
+ CortexX1,
ExynosM3,
Falkor,
Kryo,
@@ -63,7 +69,8 @@ public:
ThunderXT81,
ThunderXT83,
ThunderXT88,
- TSV110
+ TSV110,
+ ThunderX3T110
};
protected:
@@ -75,6 +82,7 @@ protected:
bool HasV8_3aOps = false;
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
+ bool HasV8_6aOps = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
@@ -99,6 +107,10 @@ protected:
bool HasPAN_RWV = false;
bool HasCCPP = false;
+ // SVE extensions
+ bool HasSVE = false;
+ bool UseExperimentalZeroingPseudos = false;
+
// Armv8.2 Crypto extensions
bool HasSM4 = false;
bool HasSHA3 = false;
@@ -125,8 +137,6 @@ protected:
bool HasRCPC_IMMO = false;
bool HasLSLFast = false;
- bool HasSVE = false;
- bool HasSVE2 = false;
bool HasRCPC = false;
bool HasAggressiveFMA = false;
@@ -143,7 +153,17 @@ protected:
bool HasMTE = false;
bool HasTME = false;
+ // Armv8.6-A Extensions
+ bool HasBF16 = false;
+ bool HasMatMulInt8 = false;
+ bool HasMatMulFP32 = false;
+ bool HasMatMulFP64 = false;
+ bool HasAMVS = false;
+ bool HasFineGrainedTraps = false;
+ bool HasEnhancedCounterVirtualization = false;
+
// Arm SVE2 extensions
+ bool HasSVE2 = false;
bool HasSVE2AES = false;
bool HasSVE2SM4 = false;
bool HasSVE2SHA3 = false;
@@ -196,6 +216,8 @@ protected:
bool UseEL2ForTP = false;
bool UseEL3ForTP = false;
bool AllowTaggedGlobals = false;
+ bool HardenSlsRetBr = false;
+ bool HardenSlsBlr = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@@ -225,6 +247,7 @@ protected:
/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
@@ -260,6 +283,7 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
const CallLowering *getCallLowering() const override;
+ const InlineAsmLowering *getInlineAsmLowering() const override;
InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -347,6 +371,9 @@ public:
hasFuseCCSelect() || hasFuseLiterals();
}
+ bool hardenSlsRetBr() const { return HardenSlsRetBr; }
+ bool hardenSlsBlr() const { return HardenSlsBlr; }
+
bool useEL1ForTP() const { return UseEL1ForTP; }
bool useEL2ForTP() const { return UseEL2ForTP; }
bool useEL3ForTP() const { return UseEL3ForTP; }
@@ -359,7 +386,12 @@ public:
}
unsigned getCacheLineSize() const override { return CacheLineSize; }
unsigned getPrefetchDistance() const override { return PrefetchDistance; }
- unsigned getMinPrefetchStride() const override { return MinPrefetchStride; }
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override {
+ return MinPrefetchStride;
+ }
unsigned getMaxPrefetchIterationsAhead() const override {
return MaxPrefetchIterationsAhead;
}
@@ -372,6 +404,10 @@ public:
unsigned getWideningBaseCost() const { return WideningBaseCost; }
+ bool useExperimentalZeroingPseudos() const {
+ return UseExperimentalZeroingPseudos;
+ }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -401,6 +437,16 @@ public:
bool hasSVE2SM4() const { return HasSVE2SM4; }
bool hasSVE2SHA3() const { return HasSVE2SHA3; }
bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
+ bool hasMatMulInt8() const { return HasMatMulInt8; }
+ bool hasMatMulFP32() const { return HasMatMulFP32; }
+ bool hasMatMulFP64() const { return HasMatMulFP64; }
+
+ // Armv8.6-A Extensions
+ bool hasBF16() const { return HasBF16; }
+ bool hasFineGrainedTraps() const { return HasFineGrainedTraps; }
+ bool hasEnhancedCounterVirtualization() const {
+ return HasEnhancedCounterVirtualization;
+ }
bool isLittleEndian() const { return IsLittle; }
@@ -438,6 +484,7 @@ public:
bool hasDIT() const { return HasDIT; }
bool hasTRACEV8_4() const { return HasTRACEV8_4; }
bool hasAM() const { return HasAM; }
+ bool hasAMVS() const { return HasAMVS; }
bool hasSEL2() const { return HasSEL2; }
bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
@@ -497,6 +544,12 @@ public:
}
void mirFileLoaded(MachineFunction &MF) const override;
+
+ // Return the known range for the bit length of SVE data registers. A value
+ // of 0 means nothing is known about that particular limit beyond what's
+ // implied by the architecture.
+ unsigned getMaxSVEVectorSizeInBits() const;
+ unsigned getMinSVEVectorSizeInBits() const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 6e82d326e5194..ceceabc6ff4ed 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,18 +18,18 @@ include "llvm/TableGen/SearchableTable.td"
//===----------------------------------------------------------------------===//
def HasCCPP : Predicate<"Subtarget->hasCCPP()">,
- AssemblerPredicate<"FeatureCCPP", "ccpp">;
+ AssemblerPredicate<(all_of FeatureCCPP), "ccpp">;
def HasPAN : Predicate<"Subtarget->hasPAN()">,
- AssemblerPredicate<"FeaturePAN",
+ AssemblerPredicate<(all_of FeaturePAN),
"ARM v8.1 Privileged Access-Never extension">;
def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">,
- AssemblerPredicate<"FeaturePsUAO",
+ AssemblerPredicate<(all_of FeaturePsUAO),
"ARM v8.2 UAO PState extension (psuao)">;
def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">,
- AssemblerPredicate<"FeaturePAN_RWV",
+ AssemblerPredicate<(all_of FeaturePAN_RWV),
"ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">;
//===----------------------------------------------------------------------===//
@@ -338,7 +338,7 @@ def : PState<"PAN", 0b00100>;
// v8.2a "User Access Override" extension-specific PStates
let Requires = [{ {AArch64::FeaturePsUAO} }] in
def : PState<"UAO", 0b00011>;
-// v8.4a timining insensitivity of data processing instructions
+// v8.4a timing insensitivity of data processing instructions
let Requires = [{ {AArch64::FeatureDIT} }] in
def : PState<"DIT", 0b11010>;
// v8.5a Spectre Mitigation
@@ -844,7 +844,7 @@ def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>;
def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>;
def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>;
def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>;
-def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : ROSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
@@ -1167,7 +1167,6 @@ def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>;
def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>;
def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>;
def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>;
-def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>;
def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>;
def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>;
def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>;
@@ -1185,9 +1184,8 @@ def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
-def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : ROSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
-def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
@@ -1260,7 +1258,7 @@ let Requires = [{ {AArch64::FeatureSPE} }] in {
def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>;
def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>;
-def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : ROSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>;
def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>;
def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>;
@@ -1269,7 +1267,7 @@ def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>;
def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>;
def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>;
def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>;
-def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
+def : ROSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
}
// v8.2a "RAS extension" registers
@@ -1333,7 +1331,6 @@ def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
-def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>;
def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>;
def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>;
def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>;
@@ -1360,7 +1357,7 @@ def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>;
} //FeatureMPAM
-// v8.4a Activitiy Monitor registers
+// v8.4a Activity Monitor registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureAM} }] in {
def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>;
@@ -1426,7 +1423,7 @@ def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>;
def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>;
} //FeatureTRACEV8_4
-// v8.4a Timining insensitivity of data processing instructions
+// v8.4a Timing insensitivity of data processing instructions
// DIT: Data Independent Timing instructions
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureDIT} }] in {
@@ -1490,6 +1487,41 @@ def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>;
def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>;
} // FeatureTRBE
+
+// v8.6a Activity Monitors Virtualization Support
+let Requires = [{ {AArch64::FeatureAMVS} }] in {
+foreach n = 0-15 in {
+ foreach x = 0-1 in {
+ def : RWSysReg<"AMEVCNTVOFF"#x#n#"_EL2",
+ 0b11, 0b100, 0b1101, 0b1000, 0b000>{
+ let Encoding{4} = x;
+ let Encoding{3-0} = n;
+ }
+ }
+}
+}
+
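Editor's note: the foreach above stamps out 32 registers, AMEVCNTVOFF0<0..15>_EL2 and AMEVCNTVOFF1<0..15>_EL2. A C++ sketch of the resulting encodings, assuming the usual 16-bit system-register layout (op0[15:14] op1[13:11] CRn[10:7] CRm[6:3] op2[2:0]) used throughout this file:

    #include <cstdint>

    uint16_t amevcntvoffEncoding(unsigned X, unsigned N) {
      // Base encoding from the def: op0=0b11, op1=0b100, CRn=0b1101,
      // CRm=0b1000, op2=0b000.
      uint16_t Enc = (0b11 << 14) | (0b100 << 11) | (0b1101 << 7) |
                     (0b1000 << 3) | 0b000;
      Enc = (Enc & ~(1u << 4)) | ((X & 1u) << 4); // Encoding{4}   = x
      Enc = (Enc & ~0xFu) | (N & 0xFu);           // Encoding{3-0} = n
      return Enc; // x selects CRm bit 1; n spans CRm bit 0 and op2
    }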
+// v8.6a Fine Grained Virtualization Traps
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureFineGrainedTraps} }] in {
+def : RWSysReg<"HFGRTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b100>;
+def : RWSysReg<"HFGWTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b101>;
+def : RWSysReg<"HFGITR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b110>;
+def : RWSysReg<"HDFGRTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b100>;
+def : RWSysReg<"HDFGWTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b101>;
+}
+
+// v8.6a Enhanced Counter Virtualization
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureEnhancedCounterVirtualization} }] in {
+def : RWSysReg<"CNTSCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b100>;
+def : RWSysReg<"CNTISCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b101>;
+def : RWSysReg<"CNTPOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b110>;
+def : RWSysReg<"CNTVFRQ_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b111>;
+def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>;
+def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>;
+}
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcAppleA7} }] in
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 115a7da8a6d90..a63b9a97ada55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,6 +11,7 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetObjectFile.h"
@@ -26,6 +27,7 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -146,6 +148,11 @@ static cl::opt<int> EnableGlobalISelAtO(
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
cl::init(0));
+static cl::opt<bool> EnableSVEIntrinsicOpts(
+ "aarch64-sve-intrinsic-opts", cl::Hidden,
+ cl::desc("Enable SVE intrinsic opts"),
+ cl::init(true));
+
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@@ -176,13 +183,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64LoadStoreOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
+ initializeAArch64PostLegalizerCombinerPass(*PR);
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
initializeFalkorHWPFFixPass(*PR);
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
+ initializeSVEIntrinsicOptsPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
+ initializeAArch64SLSHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
initializeAArch64StackTaggingPreRAPass(*PR);
}
@@ -236,12 +246,8 @@ getEffectiveAArch64CodeModel(const Triple &TT, Optional<CodeModel::Model> CM,
if (CM) {
if (*CM != CodeModel::Small && *CM != CodeModel::Tiny &&
*CM != CodeModel::Large) {
- if (!TT.isOSFuchsia())
- report_fatal_error(
- "Only small, tiny and large code models are allowed on AArch64");
- else if (*CM != CodeModel::Kernel)
- report_fatal_error("Only small, tiny, kernel, and large code models "
- "are allowed on AArch64");
+ report_fatal_error(
+ "Only small, tiny and large code models are allowed on AArch64");
} else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF())
report_fatal_error("tiny code model is only supported on ELF");
return *CM;
@@ -313,6 +319,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
// AArch64 supports default outlining behaviour.
setSupportsDefaultOutlining(true);
+
+ // AArch64 supports the debug entry values.
+ setSupportsDebugEntryValues(true);
}
AArch64TargetMachine::~AArch64TargetMachine() = default;
@@ -403,6 +412,7 @@ public:
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
@@ -435,6 +445,10 @@ void AArch64PassConfig::addIRPasses() {
// ourselves.
addPass(createAtomicExpandPass());
+ // Optimize any SVE intrinsic calls that we cannot code generate directly.
+ if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createSVEIntrinsicOptsPass());
+
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
// ldrex/strex loops to simplify this, but it needs tidying up.
@@ -454,6 +468,9 @@ void AArch64PassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
+ addPass(createAArch64StackTaggingPass(
+ /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
+
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedLoadCombinePass());
@@ -473,9 +490,6 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
- addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() !=
- CodeGenOpt::None));
-
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows())
addPass(createCFGuardCheckPass());
@@ -541,6 +555,14 @@ bool AArch64PassConfig::addLegalizeMachineIR() {
return false;
}
+void AArch64PassConfig::addPreRegBankSelect() {
+ // For now we don't add this to the pipeline for -O0. We could do so in the
+ // future if we split the combines into separate O0/opt groupings.
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ if (!IsOptNone)
+ addPass(createAArch64PostLegalizeCombiner(IsOptNone));
+}
+
bool AArch64PassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
@@ -614,6 +636,9 @@ void AArch64PassConfig::addPreSched2() {
// info.
addPass(createAArch64SpeculationHardeningPass());
+ addPass(createAArch64IndirectThunks());
+ addPass(createAArch64SLSHardeningPass());
+
if (TM->getOptLevel() != CodeGenOpt::None) {
if (EnableFalkorHWPFFix)
addPass(createFalkorHWPFFixPass());
@@ -648,4 +673,28 @@ void AArch64PassConfig::addPreEmitPass() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
TM->getTargetTriple().isOSBinFormatMachO())
addPass(createAArch64CollectLOHPass());
+
+ // SVE bundles MOVPRFX instructions together with their destructive
+ // operations; unpack the bundles again before emission.
+ addPass(createUnpackMachineBundles(nullptr));
+}
+
+yaml::MachineFunctionInfo *
+AArch64TargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::AArch64FunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<AArch64FunctionInfo>();
+ return new yaml::AArch64FunctionInfo(*MFI);
+}
+
+bool AArch64TargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const auto &YamlMFI =
+ reinterpret_cast<const yaml::AArch64FunctionInfo &>(MFI);
+ MachineFunction &MF = PFS.MF;
+ MF.getInfo<AArch64FunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+ return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 5264efb89b9c5..7738a42293919 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -49,6 +49,14 @@ public:
return TLOF.get();
}
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
+
private:
bool isLittle;
};
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 54562094fcf56..dfc66f0cb4c16 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -20,7 +20,6 @@ using namespace dwarf;
void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
// AARCH64 ELF ABI does not define static relocation type for TLS offset
// within a module. Do not generate AT_location for TLS variables.
SupportDebugThreadLocalLocation = false;
@@ -43,7 +42,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
MCSymbol *PCSym = getContext().createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
@@ -68,7 +67,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
MCSymbol *PCSym = getContext().createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index 1cb4c028c80d2..28324c2ae608f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -18,6 +18,11 @@ class AArch64TargetMachine;
/// This implementation is used for AArch64 ELF targets (Linux in particular).
class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+public:
+ AArch64_ELFTargetObjectFile() {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
+ }
};
/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4724d6b8daea7..cf6de797727be 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -57,7 +57,8 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
}
/// Calculate the cost of materializing the given constant.
-int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -82,7 +83,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -139,16 +141,17 @@ int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Idx == ImmIdx) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -161,7 +164,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
// selected instruction, so we compute the materialization cost for the
// immediate directly.
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
switch (IID) {
default:
@@ -174,7 +177,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -190,7 +193,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
@@ -208,8 +211,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
// A helper that returns a vector type from the given type. The number of
// elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
- return VectorType::get(ArgTy->getScalarType(),
- DstTy->getVectorNumElements());
+ return FixedVectorType::get(ArgTy->getScalarType(),
+ cast<FixedVectorType>(DstTy)->getNumElements());
};
// Exit early if DstTy is not a vector type whose elements are at least
@@ -251,7 +254,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
// Legalize the source type and ensure it can be used in a widening
// operation.
- Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto *SrcTy = toVectorTy(Extend->getSrcTy());
auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
@@ -267,6 +270,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
}
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -291,11 +295,18 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
}
}
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
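Editor's note: the lambda above collapses the cost for non-throughput queries into a free/not-free answer. A minimal restatement (illustrative only):

    int adjustCost(int Cost, bool IsRecipThroughput) {
      if (!IsRecipThroughput)
        return Cost == 0 ? 0 : 1; // size/latency kinds get a binary answer
      return Cost;                // throughput keeps the modelled value
    }
    // adjustCost(4, false) == 1; adjustCost(0, false) == 0;
    // adjustCost(4, true) == 4.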
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
static const TypeConversionCostTblEntry
ConversionTbl[] = {
@@ -397,9 +408,9 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
@@ -425,17 +436,18 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
auto DstVT = TLI->getValueType(DL, Dst);
auto SrcVT = TLI->getValueType(DL, Src);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If the resulting type is still a vector and the destination type is legal,
// we may get the extension for free. If not, get the default cost for the
// extend.
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
// The destination type should be larger than the element type. If not, get
// the default cost for the extend.
if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
switch (Opcode) {
default:
@@ -454,7 +466,16 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
}
// If we are unable to perform the extend for free, get the default cost.
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
+}
+
+unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
+ // Branches are assumed to be predicted.
+ return 0;
}
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
@@ -483,10 +504,17 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
int AArch64TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -504,7 +532,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
case ISD::SDIV:
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -513,16 +542,20 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return Cost;
@@ -535,31 +568,34 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Vector signed division by constant are expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHS + SUB + SRL + ADD + SRL.
- int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
- Opd2Info,
+ int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
- Opd2Info,
+ int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
- Opd2Info,
+ int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
}
}
- Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
if (Ty->isVectorTy()) {
// On AArch64, vector divisions are not supported natively and are
// expanded into scalar divisions of each pair of elements.
- Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
- Opd2Info, Opd1PropInfo, Opd2PropInfo);
- Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
- Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
// TODO: if one of the arguments is scalar, then it's not necessary to
// double the cost of handling the vector elements.
Cost += Cost;
@@ -574,6 +610,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
return (Cost + 1) * LT.first;
+
+ case ISD::FADD:
+ // These nodes are marked as 'custom' just to lower them to SVE.
+ // We know said lowering will incur no additional cost.
+ if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
+ return (Cost + 2) * LT.first;
+
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
}
}
@@ -596,7 +642,12 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) {
+ Type *CondTy,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -623,13 +674,18 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
- Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+ if (ST->requiresStrictAlign()) {
+ // TODO: Add cost modeling for strict align. Misaligned loads expand to
+ // a bunch of instructions when strict align is enabled.
+ return Options;
+ }
+ Options.AllowOverlappingLoads = true;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = Options.MaxNumLoads;
// TODO: Though vector loads usually perform well on AArch64, in some targets
@@ -641,7 +697,17 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Ty, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+ CostKind);
+
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -656,7 +722,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+ if (Ty->isVectorTy() &&
+ cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
// We use a custom trunc store lowering so v.4b should be profitable.
@@ -666,8 +733,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
// have to promote the elements to v.2.
ProfitableNumElements = 8;
- if (Ty->getVectorNumElements() < ProfitableNumElements) {
- unsigned NumVecElts = Ty->getVectorNumElements();
+ if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
+ unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;
@@ -677,20 +744,18 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first;
}
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
- assert(isa<VectorType>(VecTy) && "Expect a vector type");
+ auto *VecVTy = cast<FixedVectorType>(VecTy);
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
- unsigned NumElts = VecTy->getVectorNumElements();
- auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+ unsigned NumElts = VecVTy->getNumElements();
+ auto *SubVecTy =
+ FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -701,18 +766,20 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
int Cost = 0;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
for (auto *I : Tys) {
if (!I->isVectorTy())
continue;
- if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
- Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) +
- getMemoryOpCost(Instruction::Load, I, Align(128), 0);
+ if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
+ 128)
+ Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
+ getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
}
return Cost;
}
@@ -792,6 +859,11 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
getFalkorUnrollingPreferences(L, SE, UP);
}
+void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
+
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) {
switch (Inst->getIntrinsicID()) {
@@ -902,7 +974,7 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
- assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ auto *VTy = cast<VectorType>(Ty);
unsigned ScalarBits = Ty->getScalarSizeInBits();
switch (Opcode) {
case Instruction::FAdd:
@@ -913,10 +985,10 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
case Instruction::Mul:
return false;
case Instruction::Add:
- return ScalarBits * Ty->getVectorNumElements() >= 128;
+ return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
case Instruction::ICmp:
return (ScalarBits < 64) &&
- (ScalarBits * Ty->getVectorNumElements() >= 128);
+ (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
case Instruction::FCmp:
return Flags.NoNaN;
default:
@@ -925,11 +997,14 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
return false;
}
-int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwiseForm) {
+int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
+ VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) {
if (IsPairwiseForm)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -950,11 +1025,12 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
}
-int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
static const CostTblEntry ShuffleTbl[] = {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 6f4569a497831..1f029689a60e6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -72,11 +72,11 @@ public:
using BaseT::getIntImmCost;
int getIntImmCost(int64_t Val);
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
@@ -98,6 +98,8 @@ public:
unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
+ if (ST->hasSVE())
+ return std::max(ST->getMinSVEVectorSizeInBits(), 128u);
if (ST->hasNEON())
return 128;
return 0;
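Editor's note: a sketch of the decision above, with the subtarget reduced to three inputs (illustrative, not the patch's code):

    #include <algorithm>

    unsigned vectorRegBitWidth(bool HasSVE, bool HasNEON, unsigned MinSVEBits) {
      if (HasSVE)
        return std::max(MinSVEBits, 128u); // SVE registers are at least 128 bits
      return HasNEON ? 128 : 0;
    }
    // With -aarch64-sve-vector-bits-min=512 this reports 512 to the
    // vectorizer; with the default of 0 it falls back to 128.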
@@ -112,15 +114,19 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -131,30 +137,37 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
- bool isLegalMaskedLoadStore(Type *DataType, MaybeAlign Alignment) {
- if (!isa<VectorType>(DataType) || !ST->hasSVE())
+ bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
+ if (!isa<ScalableVectorType>(DataType) || !ST->hasSVE())
return false;
- Type *Ty = DataType->getVectorElementType();
- if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
+ Type *Ty = cast<ScalableVectorType>(DataType)->getElementType();
+ if (Ty->isBFloatTy() || Ty->isHalfTy() ||
+ Ty->isFloatTy() || Ty->isDoubleTy())
return true;
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
@@ -164,26 +177,58 @@ public:
return false;
}
- bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);
}
- bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedStore(Type *DataType, Align Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);
}
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ bool isLegalNTStore(Type *DataType, Align Alignment) {
+ // NOTE: The logic below is mostly geared towards the loop vectorizer (LV),
+ // which calls it with 2-element vectors. We might want to improve that if
+ // other users show up.
+ // Nontemporal vector stores can be directly lowered to STNP, if the vector
+ // can be halved so that each half fits into a register. That's the case if
+ // the element type fits into a register and the number of elements is a
+ // power of 2 > 1.
+ if (auto *DataTypeVTy = dyn_cast<VectorType>(DataType)) {
+ unsigned NumElements =
+ cast<FixedVectorType>(DataTypeVTy)->getNumElements();
+ unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits();
+ return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
+ EltSize <= 128 && isPowerOf2_64(EltSize);
+ }
+ return BaseT::isLegalNTStore(DataType, Alignment);
+ }
+
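Editor's note: the predicate above, restated standalone for clarity (a sketch in which a fixed vector is reduced to an element count and an element width in bits):

    #include <cstdint>

    static bool isPow2(uint64_t V) { return V && (V & (V - 1)) == 0; }

    bool ntStoreLowersToSTNP(unsigned NumElements, unsigned EltSizeInBits) {
      // Each half of the vector must fit a register: a power-of-two element
      // width between 8 and 128 bits, and a power-of-two element count > 1.
      return NumElements > 1 && isPow2(NumElements) && EltSizeInBits >= 8 &&
             EltSizeInBits <= 128 && isPow2(EltSizeInBits);
    }
    // <2 x i64> and <4 x i32> qualify; <3 x i32> and <1 x i64> fall back to
    // the generic BaseT implementation.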
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
bool shouldExpandReduction(const IntrinsicInst *II) const {
- return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_vector_reduce_v2_fadd:
+ case Intrinsic::experimental_vector_reduce_v2_fmul:
+ // We don't have legalization support for ordered FP reductions.
+ return !II->getFastMathFlags().allowReassoc();
+
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ // Lowering asserts that there are no NaNs.
+ return !II->getFastMathFlags().noNaNs();
+
+ default:
+ // Don't expand anything else, let legalization deal with it.
+ return false;
+ }
}
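Editor's note: the expansion policy above, restated as a standalone decision function (a sketch; the enum is illustrative):

    enum class FPReduction { OrderedFAdd, OrderedFMul, FMax, FMin, Other };

    bool shouldExpand(FPReduction R, bool AllowReassoc, bool NoNaNs) {
      switch (R) {
      case FPReduction::OrderedFAdd:
      case FPReduction::OrderedFMul:
        return !AllowReassoc; // no legalization support for ordered reductions
      case FPReduction::FMax:
      case FPReduction::FMin:
        return !NoNaNs;       // lowering asserts that there are no NaNs
      default:
        return false;         // let legalization handle everything else
      }
    }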
unsigned getGISelRematGlobalCost() const {
@@ -193,10 +238,12 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
- int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm);
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
/// @}
};
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index be4c960224727..0ac09c4f96f04 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -260,6 +260,8 @@ public:
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseDirective(AsmToken DirectiveID) override;
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
@@ -755,12 +757,13 @@ public:
return false;
int64_t Val = MCE->getValue();
- int64_t SVal = typename std::make_signed<T>::type(Val);
- int64_t UVal = typename std::make_unsigned<T>::type(Val);
- if (Val != SVal && Val != UVal)
+ // Build the mask with two shifts to avoid an undefined left shift by 64
+ // when T is 64 bits wide.
+ uint64_t Upper = UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4);
+ // Allow all-0 or all-1 in top bits to permit bitwise NOT.
+ if ((Val & Upper) && (Val & Upper) != Upper)
return false;
- return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
+ return AArch64_AM::isLogicalImmediate(Val & ~Upper, sizeof(T) * 8);
}
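Editor's note: why the mask is built with two shifts: for a 64-bit T, a single shift by 64 is undefined behaviour in C++, while two shifts by 32 are well defined. A standalone sketch of the check:

    #include <cstdint>

    template <typename T> bool topBitsAllZeroOrAllOne(uint64_t Val) {
      uint64_t Upper = UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4);
      // Accept zero-extended values and sign-extended ones, so bitwise-NOT
      // immediates still validate; for T = int64_t, Upper is 0 and every
      // value passes through to the logical-immediate check.
      return (Val & Upper) == 0 || (Val & Upper) == Upper;
    }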
bool isShiftedImm() const { return Kind == k_ShiftedImm; }
@@ -852,8 +855,7 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first)
@@ -870,8 +872,7 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first
@@ -969,11 +970,15 @@ public:
bool isMOVZMovAlias() const {
if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- uint64_t Value = CE->getValue();
+ const MCExpr *E = getImm();
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(E)) {
+ uint64_t Value = CE->getValue();
- return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
+ return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
+ }
+ // Only the Shift == 0 case is supported when an expression is used as an
+ // operand.
+ return !Shift && E;
}
template<int RegWidth, int Shift>
@@ -1033,8 +1038,10 @@ public:
bool isNeonVectorRegLo() const {
return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
- AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
- Reg.RegNum);
+ (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+ Reg.RegNum) ||
+ AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+ Reg.RegNum));
}
template <unsigned Class> bool isSVEVectorReg() const {
@@ -1606,7 +1613,7 @@ public:
void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- typename std::make_unsigned<T>::type Val = MCE->getValue();
+ std::make_unsigned_t<T> Val = MCE->getValue();
uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
@@ -1615,7 +1622,7 @@ public:
void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- typename std::make_unsigned<T>::type Val = ~MCE->getValue();
+ std::make_unsigned_t<T> Val = ~MCE->getValue();
uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
@@ -1771,9 +1778,13 @@ public:
void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- uint64_t Value = CE->getValue();
- Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff));
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (CE) {
+ uint64_t Value = CE->getValue();
+ Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff));
+ } else {
+ addExpr(Inst, getImm());
+ }
}
template<int Shift>
@@ -2243,10 +2254,16 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success;
+}
+
+OperandMatchResultTy AArch64AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
StartLoc = getLoc();
auto Res = tryParseScalarRegister(RegNo);
EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- return Res != MatchOperand_Success;
+ return Res;
}
// Matches a register name or register alias previously defined by '.req'
@@ -2404,9 +2421,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePrefetch(
*PRFM, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2427,9 +2444,9 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePSBHint(
PSB->Encoding, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2450,9 +2467,9 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreateBTIHint(
BTI->Encoding, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2827,6 +2844,7 @@ static const struct Extension {
{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
+ {"rcpc", {AArch64::FeatureRCPC}},
{"sve", {AArch64::FeatureSVE}},
{"sve2", {AArch64::FeatureSVE2}},
{"sve2-aes", {AArch64::FeatureSVE2AES}},
@@ -2851,6 +2869,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.4a";
else if (FBS[AArch64::HasV8_5aOps])
Str += "ARMv8.5a";
+ else if (FBS[AArch64::HasV8_6aOps])
+ Str += "ARMv8.6a";
else {
auto ext = std::find_if(std::begin(ExtensionMap),
std::end(ExtensionMap),
@@ -3771,7 +3791,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
// First check for the AArch64-specific .req directive.
if (Parser.getTok().is(AsmToken::Identifier) &&
- Parser.getTok().getIdentifier() == ".req") {
+ Parser.getTok().getIdentifier().lower() == ".req") {
parseDirectiveReq(Name, NameLoc);
// We always return 'error' for this, as we're done with this
// statement and don't need to match the instruction.
@@ -4106,6 +4126,16 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
"unpredictable STXP instruction, status is also a source");
break;
}
+ case AArch64::LDRABwriteback:
+ case AArch64::LDRAAwriteback: {
+ unsigned Xt = Inst.getOperand(0).getReg();
+ unsigned Xn = Inst.getOperand(1).getReg();
+ if (Xt == Xn)
+ return Error(Loc[0],
+ "unpredictable LDRA instruction, writeback base"
+ " is also a destination");
+ break;
+ }
}
@@ -4235,6 +4265,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "index must be a multiple of 4 in range [-32, 28].");
case Match_InvalidMemoryIndexed16SImm4:
return Error(Loc, "index must be a multiple of 16 in range [-128, 112].");
+ case Match_InvalidMemoryIndexed32SImm4:
+ return Error(Loc, "index must be a multiple of 32 in range [-256, 224].");
case Match_InvalidMemoryIndexed1SImm6:
return Error(Loc, "index must be an integer in range [-32, 31].");
case Match_InvalidMemoryIndexedSImm8:
@@ -4824,7 +4856,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
}
case Match_MissingFeature: {
@@ -4894,6 +4926,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryIndexed4SImm4:
case Match_InvalidMemoryIndexed1SImm6:
case Match_InvalidMemoryIndexed16SImm4:
+ case Match_InvalidMemoryIndexed32SImm4:
case Match_InvalidMemoryIndexed4SImm7:
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
@@ -5024,7 +5057,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
getContext().getObjectFileInfo()->getObjectFileType();
bool IsMachO = Format == MCObjectFileInfo::IsMachO;
- StringRef IDVal = DirectiveID.getIdentifier();
+ auto IDVal = DirectiveID.getIdentifier().lower();
SMLoc Loc = DirectiveID.getLoc();
if (IDVal == ".arch")
parseDirectiveArch(Loc);
@@ -5076,6 +5109,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
break;
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
+ case AArch64::ArchKind::ARMV8_6A:
RequestedExtensions.push_back("sm4");
RequestedExtensions.push_back("sha3");
RequestedExtensions.push_back("sha2");
@@ -5095,6 +5129,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
break;
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
+ case AArch64::ArchKind::ARMV8_6A:
RequestedExtensions.push_back("nosm4");
RequestedExtensions.push_back("nosha3");
RequestedExtensions.push_back("nosha2");
@@ -5314,7 +5349,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
Inst.setOpcode(AArch64::TLSDESCCALL);
Inst.addOperand(MCOperand::createExpr(Expr));
- getParser().getStreamer().EmitInstruction(Inst, getSTI());
+ getParser().getStreamer().emitInstruction(Inst, getSTI());
return false;
}
@@ -5365,7 +5400,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
"unexpected token in '" + Twine(IDVal) + "' directive"))
return true;
- getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+ getStreamer().emitLOHDirective((MCLOHType)Kind, Args);
return false;
}
@@ -5458,7 +5493,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
return true;
- getStreamer().EmitCFINegateRAState();
+ getStreamer().emitCFINegateRAState();
return false;
}
@@ -5468,7 +5503,7 @@ bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.cfi_b_key_frame'"))
return true;
- getStreamer().EmitCFIBKeyFrame();
+ getStreamer().emitCFIBKeyFrame();
return false;
}
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index d6db88603429f..1ff4abb340540 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -146,6 +146,9 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
@@ -1501,6 +1504,39 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
return Success;
}
+static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 |
+ fieldFromInstruction(insn, 12, 9);
+ unsigned writeback = fieldFromInstruction(insn, 11, 1);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::LDRAAwriteback:
+ case AArch64::LDRABwriteback:
+ DecodeGPR64spRegisterClass(Inst, Rn /* writeback register */, Addr,
+ Decoder);
+ break;
+ case AArch64::LDRAAindexed:
+ case AArch64::LDRABindexed:
+ break;
+ }
+
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeSImm<10>(Inst, offset, Addr, Decoder);
+
+ if (writeback && Rt == Rn && Rn != 31) {
+ return SoftFail;
+ }
+
+ return Success;
+}
+
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
const void *Decoder) {
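
In DecodeAuthLoadInstruction above, the 10-bit signed offset is assembled from two fields (bit 22 is the top bit, bits [20:12] the low nine) and sign-extended by DecodeSImm<10>; architecturally the immediate is then scaled by 8. A sketch of that arithmetic (the scaling is ISA semantics, not something the decoder itself applies):

#include <cassert>
#include <cstdint>

// Reassemble and sign-extend the LDRAA/LDRAB offset field the way the
// decoder does, then apply the ISA's 8-byte scaling for illustration.
static int64_t authLoadByteOffset(uint32_t insn) {
  uint64_t raw = ((insn >> 22) & 1) << 9 | ((insn >> 12) & 0x1ff);
  int64_t simm10 = int64_t(raw << 54) >> 54; // sign-extend from 10 bits
  return simm10 * 8;
}

int main() {
  assert(authLoadByteOffset(1u << 12) == 8);        // imm9 == 1
  assert(authLoadByteOffset(1u << 22) == -512 * 8); // only the sign bit
}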
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 76ff238234d99..11a8d5def4296 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -62,10 +62,9 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
+ auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI);
StackUsed = std::max(StackUsed, Size + Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -87,10 +86,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- // FIXME: Get alignment
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 1);
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -134,7 +133,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
int FPDiff = 0)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
- StackSize(0) {}
+ StackSize(0), SPReg(0) {}
bool isIncomingArgumentHandler() const override { return false; }
@@ -147,23 +146,20 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
if (IsTailCall) {
Offset += FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
- Register FIReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildFrameIndex(FIReg, FI);
+ auto FIReg = MIRBuilder.buildFrameIndex(p0, FI);
MPO = MachinePointerInfo::getFixedStack(MF, FI);
- return FIReg;
+ return FIReg.getReg(0);
}
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
+ if (!SPReg)
+ SPReg = MIRBuilder.buildCopy(p0, Register(AArch64::SP)).getReg(0);
- Register OffsetReg = MRI.createGenericVirtualRegister(s64);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(s64, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MF, Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -175,17 +171,33 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
- Size = VA.getLocVT().getSizeInBits() / 8;
- ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg)
- ->getOperand(0)
- .getReg();
- }
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, Size, 1);
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size,
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
+ uint64_t Size, MachinePointerInfo &MPO,
+ CCValAssign &VA) override {
+ unsigned MaxSize = Size * 8;
+ // For varargs, we always want to extend them to 8 bytes, in which case
+ // we disable setting a max.
+ if (!Arg.IsFixed)
+ MaxSize = 0;
+
+ Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
+ ? extendRegister(Arg.Regs[0], VA, MaxSize)
+ : Arg.Regs[0];
+
+ // If we extended we might need to adjust the MMO's Size.
+ const LLT RegTy = MRI.getType(ValVReg);
+ if (RegTy.getSizeInBytes() > Size)
+ Size = RegTy.getSizeInBytes();
+
+ assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
+ }
+
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
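
The new assignValueToAddress overload separates extension policy from the store: fixed arguments extend at most to their CC-assigned location size, varargs always widen to a full 8 bytes (MaxSize = 0 lifts the cap), and the MMO size grows when extension widened the register. A rough model of the resulting store size, deliberately collapsing the LocInfo/FPExt details (assumption: vararg extension always reaches 8 bytes, as on the stack-varargs path):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Simplified model of bytes actually stored for a ValBytes-sized value.
static uint64_t bytesStored(uint64_t ValBytes, bool IsFixed) {
  uint64_t ExtBytes = IsFixed ? ValBytes : 8; // varargs widen to 8 bytes
  return std::max(ValBytes, ExtBytes);        // "adjust the MMO's Size"
}

int main() {
  assert(bytesStored(1, /*IsFixed=*/true) == 1);  // fixed i8: 1 byte
  assert(bytesStored(1, /*IsFixed=*/false) == 8); // vararg i8: widened
}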
@@ -209,6 +221,9 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
/// callee's. Unused elsewhere.
int FPDiff;
uint64_t StackSize;
+
+ // Cache the SP register vreg if we need it more than once in this call site.
+ Register SPReg;
};
} // namespace
@@ -222,13 +237,13 @@ void AArch64CallLowering::splitToValueTypes(
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
- if (OrigArg.Ty->isVoidTy())
- return;
-
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+ if (SplitVTs.size() == 0)
+ return;
+
if (SplitVTs.size() == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
@@ -322,8 +337,7 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}
auto Undef = MIRBuilder.buildUndef({OldLLT});
CurVReg =
- MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)})
- .getReg(0);
+ MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef}).getReg(0);
} else {
// Just do a vector extend.
CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg})
@@ -413,6 +427,14 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
}
}
+bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const {
+ if (isa<ScalableVectorType>(F.getReturnType()))
+ return true;
+ return llvm::any_of(F.args(), [](const Argument &A) {
+ return isa<ScalableVectorType>(A.getType());
+ });
+}
+
bool AArch64CallLowering::lowerFormalArguments(
MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -424,7 +446,7 @@ bool AArch64CallLowering::lowerFormalArguments(
SmallVector<ArgInfo, 8> SplitArgs;
unsigned i = 0;
for (auto &Arg : F.args()) {
- if (DL.getTypeStoreSize(Arg.getType()) == 0)
+ if (DL.getTypeStoreSize(Arg.getType()).isZero())
continue;
ArgInfo OrigArg{VRegs[i], Arg.getType()};
@@ -759,17 +781,17 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization(
return true;
}
-static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
if (!IsTailCall)
- return IsIndirect ? AArch64::BLR : AArch64::BL;
+ return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL;
if (!IsIndirect)
return AArch64::TCRETURNdi;
// When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
// x16 or x17.
- if (CallerF.hasFnAttribute("branch-target-enforcement"))
+ if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement"))
return AArch64::TCRETURNriBTI;
return AArch64::TCRETURNri;
@@ -805,7 +827,7 @@ bool AArch64CallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
- unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);
@@ -863,7 +885,6 @@ bool AArch64CallLowering::lowerTailCall(
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
// Do the actual argument marshalling.
- SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, true, FPDiff);
if (!handleAssignments(MIRBuilder, OutArgs, Handler))
@@ -965,7 +986,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);
@@ -981,7 +1002,6 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
TRI->emitReservedArgRegCallError(MF);
// Do the actual argument marshalling.
- SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, false);
if (!handleAssignments(MIRBuilder, OutArgs, Handler))
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index b0c601c7062c0..640a862530596 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -37,6 +37,8 @@ public:
ArrayRef<Register> VRegs,
Register SwiftErrorVReg) const override;
+ bool fallBackToDAGISel(const Function &F) const override;
+
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index b9ac2657e1c5a..408f0cb77e738 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -31,6 +31,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
@@ -63,6 +65,9 @@ public:
// cache it here for each run of the selector.
ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+ MFReturnAddr = Register();
+
+ processPHIs(MF);
}
private:
@@ -71,23 +76,33 @@ private:
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
// A lowering phase that runs before any selection attempts.
-
- void preISelLower(MachineInstr &I) const;
+ // Returns true if the instruction was modified.
+ bool preISelLower(MachineInstr &I);
// An early selection function that runs before the selectImpl() call.
bool earlySelect(MachineInstr &I) const;
+ // Do some preprocessing of G_PHIs before we begin selection.
+ void processPHIs(MachineFunction &MF);
+
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
/// Eliminate same-sized cross-bank copies into stores before selectImpl().
- void contractCrossBankCopyIntoStore(MachineInstr &I,
- MachineRegisterInfo &MRI) const;
+ bool contractCrossBankCopyIntoStore(MachineInstr &I,
+ MachineRegisterInfo &MRI);
+
+ bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
+ bool tryOptAndIntoCompareBranch(MachineInstr *LHS,
+ int64_t CmpConstant,
+ const CmpInst::Predicate &Pred,
+ MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const;
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
@@ -112,6 +127,8 @@ private:
const RegisterBank &RB,
MachineIRBuilder &MIRBuilder) const;
bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
+ MachineRegisterInfo &MRI) const;
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -123,7 +140,7 @@ private:
MachineRegisterInfo &MRI) const;
bool selectIntrinsicWithSideEffects(MachineInstr &I,
MachineRegisterInfo &MRI) const;
- bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -131,17 +148,25 @@ private:
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
- unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
- MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
+ unsigned emitConstantPoolEntry(const Constant *CPVal,
+ MachineFunction &MF) const;
+ MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
// Emit a vector concat operation.
MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
Register Op2,
MachineIRBuilder &MIRBuilder) const;
- MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
- MachineOperand &Predicate,
- MachineIRBuilder &MIRBuilder) const;
+
+ // Emit an integer compare between LHS and RHS, which checks for Predicate.
+ //
+ // This returns the produced compare instruction, and the predicate which
+ // was ultimately used in the compare. The predicate may differ from what
+ // is passed in \p Predicate due to optimization.
+ std::pair<MachineInstr *, CmpInst::Predicate>
+ emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
+ MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
@@ -163,6 +188,13 @@ private:
MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
MachineIRBuilder &MIRBuilder) const;
+ /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
+ /// \p IsNegative is true if the test should be "not zero".
+ /// This will also optimize the test bit instruction when possible.
+ MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
+ MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const;
+
// Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
// We use these manually instead of using the importer since it doesn't
// support SDNodeXForm.
@@ -194,6 +226,11 @@ private:
return selectAddrModeUnscaled(Root, 16);
}
+  /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
+ /// from complex pattern matchers like selectAddrModeIndexed().
+ ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
+ MachineRegisterInfo &MRI) const;
+
ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const;
template <int Width>
@@ -258,6 +295,8 @@ private:
/// new copy.
Register narrowExtendRegIfNeeded(Register ExtReg,
MachineIRBuilder &MIB) const;
+ Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size,
+ MachineIRBuilder &MIB) const;
ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
@@ -272,12 +311,17 @@ private:
unsigned OpFlags) const;
// Optimization methods.
- bool tryOptVectorShuffle(MachineInstr &I) const;
- bool tryOptVectorDup(MachineInstr &MI) const;
bool tryOptSelect(MachineInstr &MI) const;
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS,
+ MachineOperand &RHS,
+ CmpInst::Predicate &Predicate,
+ MachineIRBuilder &MIB) const;
+ MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIB) const;
/// Return true if \p MI is a load or store of \p NumBytes bytes.
bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
@@ -295,6 +339,11 @@ private:
bool ProduceNonFlagSettingCondBr = false;
+ // Some cached values used during selection.
+ // We use LR as a live-in register, and we keep track of it here as it can be
+ // clobbered by calls.
+ Register MFReturnAddr;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -421,6 +470,39 @@ static bool getSubRegForClass(const TargetRegisterClass *RC,
return true;
}
+/// Returns the minimum size the given register bank can hold.
+static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
+ switch (RB.getID()) {
+ case AArch64::GPRRegBankID:
+ return 32;
+ case AArch64::FPRRegBankID:
+ return 8;
+ default:
+ llvm_unreachable("Tried to get minimum size for unknown register bank.");
+ }
+}
+
+static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
+ auto &MI = *Root.getParent();
+ auto &MBB = *MI.getParent();
+ auto &MF = *MBB.getParent();
+ auto &MRI = MF.getRegInfo();
+ uint64_t Immed;
+ if (Root.isImm())
+ Immed = Root.getImm();
+ else if (Root.isCImm())
+ Immed = Root.getCImm()->getZExtValue();
+ else if (Root.isReg()) {
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
+ if (!ValAndVReg)
+ return None;
+ Immed = ValAndVReg->Value;
+ } else
+ return None;
+ return Immed;
+}
+
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
@@ -609,23 +691,20 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
}
#endif
-/// Helper function for selectCopy. Inserts a subregister copy from
-/// \p *From to \p *To, linking it up to \p I.
-///
-/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into
+/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
+/// to \p *To.
///
-/// CopyReg (From class) = COPY SrcReg
-/// SubRegCopy (To class) = COPY CopyReg:SubReg
-/// Dst = COPY SubRegCopy
-static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
- const RegisterBankInfo &RBI, Register SrcReg,
- const TargetRegisterClass *From,
- const TargetRegisterClass *To,
- unsigned SubReg) {
+/// E.g "To = COPY SrcReg:SubReg"
+static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
+ const RegisterBankInfo &RBI, Register SrcReg,
+ const TargetRegisterClass *To, unsigned SubReg) {
+ assert(SrcReg.isValid() && "Expected a valid source register?");
+ assert(To && "Destination register class cannot be null");
+ assert(SubReg && "Expected a valid subregister");
+
MachineIRBuilder MIB(I);
- auto Copy = MIB.buildCopy({From}, {SrcReg});
- auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {})
- .addReg(Copy.getReg(0), 0, SubReg);
+ auto SubRegCopy =
+ MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(SubRegCopy.getReg(0));
@@ -670,7 +749,6 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
-
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
@@ -703,13 +781,15 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
(!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
!Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
"No phys reg on generic operator!");
- assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
- (void)KnownValid;
- return true;
+ bool ValidCopy = true;
+#ifndef NDEBUG
+ ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
+ assert(ValidCopy && "Invalid copy.");
+#endif
+ return ValidCopy;
};
- // Is this a copy? If so, then we may need to insert a subregister copy, or
- // a SUBREG_TO_REG.
+ // Is this a copy? If so, then we may need to insert a subregister copy.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
if (!SrcRC) {
@@ -719,48 +799,43 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+ unsigned SubReg;
- // If we're doing a cross-bank copy on different-sized registers, we need
- // to do a bit more work.
- if (SrcSize > DstSize) {
- // We're doing a cross-bank copy into a smaller register. We need a
- // subregister copy. First, get a register class that's on the same bank
- // as the destination, but the same size as the source.
- const TargetRegisterClass *SubregRC =
- getMinClassForRegBank(DstRegBank, SrcSize, true);
- assert(SubregRC && "Didn't get a register class for subreg?");
-
- // Get the appropriate subregister for the destination.
- unsigned SubReg = 0;
- if (!getSubRegForClass(DstRC, TRI, SubReg)) {
- LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
- return false;
- }
-
- // Now, insert a subregister copy using the new register class.
- selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
- return CheckCopy();
- }
+ // If the source bank doesn't support a subregister copy small enough,
+ // then we first need to copy to the destination bank.
+ if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
+ const TargetRegisterClass *DstTempRC =
+ getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
+ getSubRegForClass(DstRC, TRI, SubReg);
- // Is this a cross-bank copy?
- if (DstRegBank.getID() != SrcRegBank.getID()) {
- if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
- SrcSize == 16) {
- // Special case for FPR16 to GPR32.
- // FIXME: This can probably be generalized like the above case.
- Register PromoteReg =
- MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::hsub);
- MachineOperand &RegOp = I.getOperand(1);
- RegOp.setReg(PromoteReg);
+ MachineIRBuilder MIB(I);
+ auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
+ copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
+ } else if (SrcSize > DstSize) {
+ // If the source register is bigger than the destination we need to
+ // perform a subregister copy.
+ const TargetRegisterClass *SubRegRC =
+ getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
+ getSubRegForClass(SubRegRC, TRI, SubReg);
+ copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
+ } else if (DstSize > SrcSize) {
+ // If the destination register is bigger than the source we need to do
+ // a promotion using SUBREG_TO_REG.
+ const TargetRegisterClass *PromotionRC =
+ getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
+ getSubRegForClass(SrcRC, TRI, SubReg);
+
+ Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(SubReg);
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(PromoteReg);
- // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
- KnownValid = true;
- }
+ // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
+ KnownValid = true;
}
// If the destination is a physical register, then there's nothing to
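
The rewritten selectCopy now handles three shapes generically instead of special-casing FPR16-to-GPR32. A small decision-table sketch (sizes in bits; bank minimums mirror getMinSizeForRegBank; the names are mine, not the selector's):

#include <cassert>

enum class CopyKind { Plain, ViaDstBankTempCopy, SubRegCopy, SubRegToReg };

// Which path a copy takes under the new logic, per the code above.
static CopyKind classify(unsigned SrcSize, unsigned DstSize,
                         unsigned SrcBankMinSize) {
  if (SrcBankMinSize > DstSize)
    return CopyKind::ViaDstBankTempCopy; // src bank can't go that small
  if (SrcSize > DstSize)
    return CopyKind::SubRegCopy;         // truncating subregister copy
  if (DstSize > SrcSize)
    return CopyKind::SubRegToReg;        // promotion via SUBREG_TO_REG
  return CopyKind::Plain;
}

int main() {
  // GPR32 -> FPR16: GPR can't express 16 bits, so copy to FPR32 first.
  assert(classify(32, 16, 32) == CopyKind::ViaDstBankTempCopy);
  // FPR64 -> FPR32: plain truncating subregister copy.
  assert(classify(64, 32, 8) == CopyKind::SubRegCopy);
  // FPR16 -> GPR32: the old special case, now the generic promotion.
  assert(classify(16, 32, 8) == CopyKind::SubRegToReg);
}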
@@ -977,6 +1052,216 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
}
}
+/// Return a register which can be used as a bit to test in a TB(N)Z.
+static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
+ MachineRegisterInfo &MRI) {
+ assert(Reg.isValid() && "Expected valid register!");
+ while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
+ unsigned Opc = MI->getOpcode();
+
+ if (!MI->getOperand(0).isReg() ||
+ !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ break;
+
+ // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
+ //
+ // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
+ // on the truncated x is the same as the bit number on x.
+ if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
+ Opc == TargetOpcode::G_TRUNC) {
+ Register NextReg = MI->getOperand(1).getReg();
+ // Did we find something worth folding?
+ if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
+ break;
+
+ // NextReg is worth folding. Keep looking.
+ Reg = NextReg;
+ continue;
+ }
+
+ // Attempt to find a suitable operation with a constant on one side.
+ Optional<uint64_t> C;
+ Register TestReg;
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_XOR: {
+ TestReg = MI->getOperand(1).getReg();
+ Register ConstantReg = MI->getOperand(2).getReg();
+ auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!VRegAndVal) {
+ // AND commutes, check the other side for a constant.
+ // FIXME: Can we canonicalize the constant so that it's always on the
+ // same side at some point earlier?
+ std::swap(ConstantReg, TestReg);
+ VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ }
+ if (VRegAndVal)
+ C = VRegAndVal->Value;
+ break;
+ }
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_SHL: {
+ TestReg = MI->getOperand(1).getReg();
+ auto VRegAndVal =
+ getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+ if (VRegAndVal)
+ C = VRegAndVal->Value;
+ break;
+ }
+ }
+
+ // Didn't find a constant or viable register. Bail out of the loop.
+ if (!C || !TestReg.isValid())
+ break;
+
+ // We found a suitable instruction with a constant. Check to see if we can
+ // walk through the instruction.
+ Register NextReg;
+ unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_AND:
+ // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
+ if ((*C >> Bit) & 1)
+ NextReg = TestReg;
+ break;
+ case TargetOpcode::G_SHL:
+ // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
+ // the type of the register.
+ if (*C <= Bit && (Bit - *C) < TestRegSize) {
+ NextReg = TestReg;
+ Bit = Bit - *C;
+ }
+ break;
+ case TargetOpcode::G_ASHR:
+ // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
+ // in x
+ NextReg = TestReg;
+ Bit = Bit + *C;
+ if (Bit >= TestRegSize)
+ Bit = TestRegSize - 1;
+ break;
+ case TargetOpcode::G_LSHR:
+ // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
+ if ((Bit + *C) < TestRegSize) {
+ NextReg = TestReg;
+ Bit = Bit + *C;
+ }
+ break;
+ case TargetOpcode::G_XOR:
+ // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
+ // appropriate.
+ //
+ // e.g. If x' = xor x, c, and the b-th bit is set in c then
+ //
+ // tbz x', b -> tbnz x, b
+ //
+ // Because x' only has the b-th bit set if x does not.
+ if ((*C >> Bit) & 1)
+ Invert = !Invert;
+ NextReg = TestReg;
+ break;
+ }
+
+ // Check if we found anything worth folding.
+ if (!NextReg.isValid())
+ return Reg;
+ Reg = NextReg;
+ }
+
+ return Reg;
+}
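
Each fold in getTestBitReg is a statement about where the tested bit lives after a bitwise operation. The identities, checked standalone for a sample value (plain C++, independent of the selector):

#include <cassert>
#include <cstdint>

static bool bit(uint64_t V, unsigned B) { return (V >> B) & 1; }

int main() {
  uint64_t X = 0xA5A5A5A5A5A5A5A5ull;
  unsigned B = 7, C = 3;
  assert(bit(X << C, B) == bit(X, B - C)); // shl: test bit b-c instead
  assert(bit(X >> C, B) == bit(X, B + C)); // lshr: test bit b+c instead
  uint64_t M = uint64_t(1) << B;           // a mask with bit b set
  assert(bit(X & M, B) == bit(X, B));      // and: unchanged when m[b]=1
  assert(bit(X ^ M, B) == !bit(X, B));     // xor: flips, so tbz <-> tbnz
}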
+
+MachineInstr *AArch64InstructionSelector::emitTestBit(
+ Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const {
+ assert(TestReg.isValid());
+ assert(ProduceNonFlagSettingCondBr &&
+ "Cannot emit TB(N)Z with speculation tracking!");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ // Attempt to optimize the test bit by walking over instructions.
+ TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
+ LLT Ty = MRI.getType(TestReg);
+ unsigned Size = Ty.getSizeInBits();
+ assert(!Ty.isVector() && "Expected a scalar!");
+ assert(Bit < 64 && "Bit is too large!");
+
+ // When the test register is a 64-bit register, we have to narrow to make
+ // TBNZW work.
+ bool UseWReg = Bit < 32;
+ unsigned NecessarySize = UseWReg ? 32 : 64;
+ if (Size < NecessarySize)
+ TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB);
+ else if (Size > NecessarySize)
+ TestReg = narrowExtendRegIfNeeded(TestReg, MIB);
+
+ static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
+ {AArch64::TBZW, AArch64::TBNZW}};
+ unsigned Opc = OpcTable[UseWReg][IsNegative];
+ auto TestBitMI =
+ MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
+ constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
+ return &*TestBitMI;
+}
+
+bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
+ MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred,
+ MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const {
+ // Given something like this:
+ //
+ // %x = ...Something...
+ // %one = G_CONSTANT i64 1
+ // %zero = G_CONSTANT i64 0
+ // %and = G_AND %x, %one
+ // %cmp = G_ICMP intpred(ne), %and, %zero
+ // %cmp_trunc = G_TRUNC %cmp
+ // G_BRCOND %cmp_trunc, %bb.3
+ //
+ // We want to try and fold the AND into the G_BRCOND and produce either a
+ // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
+ //
+ // In this case, we'd get
+ //
+ // TBNZ %x %bb.3
+ //
+ if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND)
+ return false;
+
+ // Need to be comparing against 0 to fold.
+ if (CmpConstant != 0)
+ return false;
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ // Only support EQ and NE. If we have LT, then it *is* possible to fold, but
+ // we don't want to do this. When we have an AND and LT, we need a TST/ANDS,
+ // so folding would be redundant.
+ if (Pred != CmpInst::Predicate::ICMP_EQ &&
+ Pred != CmpInst::Predicate::ICMP_NE)
+ return false;
+
+ // Check if the AND has a constant on its RHS which we can use as a mask.
+ // If it's a power of 2, then it's the same as checking a specific bit.
+ // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
+ auto MaybeBit =
+ getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI);
+ if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value))
+ return false;
+
+ uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value));
+ Register TestReg = AndInst->getOperand(1).getReg();
+ bool Invert = Pred == CmpInst::Predicate::ICMP_NE;
+
+ // Emit a TB(N)Z.
+ emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
+ return true;
+}
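
The power-of-two restriction is what turns the AND into a single-bit test: x & m with m == 1 << b is nonzero exactly when bit b of x is set, which TB(N)Z encodes directly. A standalone check (C++20 <bit>, illustrative):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t M = 8;                     // 0b1000, the "ANDing with 8" case
  assert(std::has_single_bit(M));     // the isPowerOf2_64 condition
  unsigned Bit = std::countr_zero(M); // Log2_64 of a power of two
  assert(Bit == 3);
  for (uint64_t X : {0x0ull, 0x7ull, 0x8ull, 0xFull})
    assert(((X & M) != 0) == (((X >> Bit) & 1) != 0));
}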
+
bool AArch64InstructionSelector::selectCompareBranch(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
@@ -991,28 +1276,67 @@ bool AArch64InstructionSelector::selectCompareBranch(
Register LHS = CCMI->getOperand(2).getReg();
Register RHS = CCMI->getOperand(3).getReg();
auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
- if (!VRegAndVal)
+ MachineIRBuilder MIB(I);
+ CmpInst::Predicate Pred =
+ (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+ MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI);
+
+ // When we can emit a TB(N)Z, prefer that.
+ //
+ // Handle non-commutative condition codes first.
+ // Note that we don't want to do this when we have a G_AND because it can
+ // become a tst. The tst will make the test bit in the TB(N)Z redundant.
+ if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) {
+ int64_t C = VRegAndVal->Value;
+
+ // When we have a greater-than comparison, we can just test if the msb is
+ // zero.
+ if (C == -1 && Pred == CmpInst::ICMP_SGT) {
+ uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
+ emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
+ I.eraseFromParent();
+ return true;
+ }
+
+ // When we have a less than comparison, we can just test if the msb is not
+ // zero.
+ if (C == 0 && Pred == CmpInst::ICMP_SLT) {
+ uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
+ emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
+ I.eraseFromParent();
+ return true;
+ }
+ }
+
+ if (!VRegAndVal) {
std::swap(RHS, LHS);
+ VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ LHSMI = getDefIgnoringCopies(LHS, MRI);
+ }
- VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
if (!VRegAndVal || VRegAndVal->Value != 0) {
- MachineIRBuilder MIB(I);
// If we can't select a CBZ then emit a cmp + Bcc.
- if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3),
- CCMI->getOperand(1), MIB))
+ MachineInstr *Cmp;
+ std::tie(Cmp, Pred) = emitIntegerCompare(
+ CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB);
+ if (!Cmp)
return false;
- const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
- (CmpInst::Predicate)CCMI->getOperand(1).getPredicate());
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred);
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
I.eraseFromParent();
return true;
}
+ // Try to emit a TB(N)Z for an eq or ne condition.
+ if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB,
+ MIB)) {
+ I.eraseFromParent();
+ return true;
+ }
+
const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID)
return false;
-
- const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
return false;
@@ -1247,7 +1571,7 @@ void AArch64InstructionSelector::materializeLargeCMVal(
return;
}
-void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
+bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1267,10 +1591,10 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
const LLT ShiftTy = MRI.getType(ShiftReg);
const LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.isVector())
- return;
+ return false;
assert(!ShiftTy.isVector() && "unexpected vector shift ty");
if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
- return;
+ return false;
auto *AmtMI = MRI.getVRegDef(ShiftReg);
assert(AmtMI && "could not find a vreg definition for shift amount");
if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
@@ -1281,14 +1605,65 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
I.getOperand(2).setReg(Trunc.getReg(0));
}
- return;
+ return true;
}
case TargetOpcode::G_STORE:
- contractCrossBankCopyIntoStore(I, MRI);
- return;
+ return contractCrossBankCopyIntoStore(I, MRI);
+ case TargetOpcode::G_PTR_ADD:
+ return convertPtrAddToAdd(I, MRI);
+ case TargetOpcode::G_LOAD: {
+ // For scalar loads of pointers, we try to convert the dest type from p0
+ // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
+ // conversion, this should be ok because all users should have been
+ // selected already, so the type doesn't matter for them.
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ if (!DstTy.isPointer())
+ return false;
+ MRI.setType(DstReg, LLT::scalar(64));
+ return true;
+ }
default:
- return;
+ return false;
+ }
+}
+
+/// This lowering tries to look for G_PTR_ADD instructions and then converts
+/// them to a standard G_ADD with a COPY on the source.
+///
+/// The motivation behind this is to expose the add semantics to the imported
+/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
+/// because the selector works bottom up, uses before defs. By the time we
+/// end up trying to select a G_PTR_ADD, we should have already attempted to
+/// fold this into addressing modes and therefore been unsuccessful.
+bool AArch64InstructionSelector::convertPtrAddToAdd(
+ MachineInstr &I, MachineRegisterInfo &MRI) {
+ assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
+ Register DstReg = I.getOperand(0).getReg();
+ Register AddOp1Reg = I.getOperand(1).getReg();
+ const LLT PtrTy = MRI.getType(DstReg);
+ if (PtrTy.getAddressSpace() != 0)
+ return false;
+
+ MachineIRBuilder MIB(I);
+ const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
+ auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
+ // Set regbanks on the registers.
+ if (PtrTy.isVector())
+ MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
+ else
+ MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+
+ // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
+ // %dst(intty) = G_ADD %intbase, off
+ I.setDesc(TII.get(TargetOpcode::G_ADD));
+ MRI.setType(DstReg, CastPtrTy);
+ I.getOperand(1).setReg(PtrToInt.getReg(0));
+ if (!select(*PtrToInt)) {
+ LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
+ return false;
}
+ return true;
}
bool AArch64InstructionSelector::earlySelectSHL(
@@ -1326,8 +1701,8 @@ bool AArch64InstructionSelector::earlySelectSHL(
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
-void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
// If we're storing a scalar, it doesn't matter what register bank that
// scalar is on. All that matters is the size.
@@ -1343,10 +1718,9 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
// G_STORE %x:gpr(s32)
//
// And then continue the selection process normally.
- MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI);
- if (!Def)
- return;
- Register DefDstReg = Def->getOperand(0).getReg();
+ Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
+ if (!DefDstReg.isValid())
+ return false;
LLT DefDstTy = MRI.getType(DefDstReg);
Register StoreSrcReg = I.getOperand(0).getReg();
LLT StoreSrcTy = MRI.getType(StoreSrcReg);
@@ -1354,18 +1728,19 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
// If we get something strange like a physical register, then we shouldn't
// go any further.
if (!DefDstTy.isValid())
- return;
+ return false;
// Are the source and dst types the same size?
if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
- return;
+ return false;
if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
RBI.getRegBank(DefDstReg, MRI, TRI))
- return;
+ return false;
// We have a cross-bank copy, which is entering a store. Let's fold it.
I.getOperand(0).setReg(DefDstReg);
+ return true;
}
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
@@ -1391,16 +1766,15 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
- if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32))
- return false;
-
- if (Ty == LLT::scalar(64)) {
+ if (Ty.getSizeInBits() == 64) {
I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
- } else {
+ } else if (Ty.getSizeInBits() == 32) {
I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
- }
+ } else
+ return false;
+
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
}
@@ -1417,9 +1791,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const AArch64Subtarget *Subtarget =
+ &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+ if (Subtarget->requiresStrictAlign()) {
+ // We don't support this feature yet.
+ LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
+ return false;
+ }
+
unsigned Opcode = I.getOpcode();
// G_PHI requires same handling as PHI
- if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) {
+ if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
// Certain non-generic instructions also need some special handling.
if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
@@ -1468,7 +1850,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Try to do some lowering before we start instruction selecting. These
// lowerings are purely transformations on the input G_MIR and so selection
// must continue after any modification of the instruction.
- preISelLower(I);
+ if (preISelLower(I)) {
+ Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
+ }
// There may be patterns where the importer can't deal with them optimally,
// but does select it to a suboptimal sequence so our custom C++ selection
@@ -1503,8 +1887,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
// instructions will not be produced, as they are conditional branch
// instructions that do not set flags.
- bool ProduceNonFlagSettingCondBr =
- !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))
return true;
@@ -1540,6 +1922,31 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_BRJT:
return selectBrJT(I, MRI);
+ case AArch64::G_ADD_LOW: {
+    // This op may have been separated from its ADRP companion by the localizer
+ // or some other code motion pass. Given that many CPUs will try to
+ // macro fuse these operations anyway, select this into a MOVaddr pseudo
+ // which will later be expanded into an ADRP+ADD pair after scheduling.
+ MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
+ if (BaseMI->getOpcode() != AArch64::ADRP) {
+ I.setDesc(TII.get(AArch64::ADDXri));
+ I.addOperand(MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+ assert(TM.getCodeModel() == CodeModel::Small &&
+ "Expected small code model");
+ MachineIRBuilder MIB(I);
+ auto Op1 = BaseMI->getOperand(1);
+ auto Op2 = I.getOperand(2);
+ auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
+ .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
+ Op1.getTargetFlags())
+ .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
+ Op2.getTargetFlags());
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
+ }
+
case TargetOpcode::G_BSWAP: {
// Handle vector types for G_BSWAP directly.
Register DstReg = I.getOperand(0).getReg();
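
For context on the G_ADD_LOW fusion above: MOVaddr later expands to an ADRP, which materializes the symbol's 4 KiB page, plus an ADD of the :lo12: low bits, and the pair always recombines to the full address. The split, checked numerically (the address is hypothetical):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Sym = 0x0000000012345ABCull;   // hypothetical symbol address
  uint64_t Page = Sym & ~uint64_t(0xfff); // what ADRP produces (PC-rel)
  uint64_t Lo12 = Sym & 0xfff;            // what the :lo12: ADD adds
  assert(Page + Lo12 == Sym);
}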
@@ -1644,6 +2051,20 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (emitFMovForFConstant(I, MRI))
return true;
+ // For 64b values, emit a constant pool load instead.
+ if (DefSize == 64) {
+ auto *FPImm = I.getOperand(1).getFPImm();
+ MachineIRBuilder MIB(I);
+ auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
+ if (!LoadMI) {
+ LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
+ return false;
+ }
+ MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
+ }
+
// Nope. Emit a copy and use a normal mov instead.
const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
MachineOperand &RegOp = I.getOperand(0);
@@ -2005,9 +2426,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Add and set the set condition flag.
unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr;
MachineIRBuilder MIRBuilder(I);
- auto AddsMI = MIRBuilder.buildInstr(
- AddsOpc, {I.getOperand(0).getReg()},
- {I.getOperand(2).getReg(), I.getOperand(3).getReg()});
+ auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)},
+ {I.getOperand(2), I.getOperand(3)});
constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI);
// Now, put the overflow result in the register given by the first operand
@@ -2023,14 +2443,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return true;
}
- case TargetOpcode::G_PTR_MASK: {
- uint64_t Align = I.getOperand(2).getImm();
- if (Align >= 64 || Align == 0)
+ case TargetOpcode::G_PTRMASK: {
+ Register MaskReg = I.getOperand(2).getReg();
+ Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI);
+ // TODO: Implement arbitrary cases
+ if (!MaskVal || !isShiftedMask_64(*MaskVal))
return false;
- uint64_t Mask = ~((1ULL << Align) - 1);
+ uint64_t Mask = *MaskVal;
I.setDesc(TII.get(AArch64::ANDXri));
- I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+ I.getOperand(2).ChangeToImmediate(
+ AArch64_AM::encodeLogicalImmediate(Mask, 64));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
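
The G_PTRMASK selection above is limited (per the TODO) to constant masks that are shifted masks, i.e. a single contiguous run of ones such as a pointer-alignment mask; those are the cases fed to encodeLogicalImmediate here. A sketch of the predicate in the spirit of isShiftedMask_64 (C++20, illustrative):

#include <bit>
#include <cassert>
#include <cstdint>

// A "shifted mask" is one contiguous run of ones; alignment masks like
// ~15 (clear the low four bits) are the common G_PTRMASK input.
static bool isShiftedMask64(uint64_t V) {
  if (V == 0)
    return false;
  uint64_t Run = V >> std::countr_zero(V); // strip trailing zeros
  return (Run & (Run + 1)) == 0;           // remaining bits contiguous
}

int main() {
  assert(isShiftedMask64(~uint64_t(15))); // align-to-16 pointer mask
  assert(isShiftedMask64(0x0FF0));
  assert(!isShiftedMask64(0x0505));       // two runs: rejected above
}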
@@ -2101,6 +2524,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.eraseFromParent();
return true;
}
+
+ // We might have a vector G_PTRTOINT, in which case just emit a COPY.
+ if (Opcode == TargetOpcode::G_PTRTOINT) {
+ assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
}
return false;
@@ -2151,16 +2581,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT_INREG:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
- const bool IsSigned = Opcode == TargetOpcode::G_SEXT;
+ const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
const Register DefReg = I.getOperand(0).getReg();
- const Register SrcReg = I.getOperand(1).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DefReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
+ // SEXT_INREG has the same src reg size as dst, the size of the value to be
+ // extended is encoded in the imm.
+ if (Opcode == TargetOpcode::G_SEXT_INREG)
+ SrcSize = I.getOperand(2).getImm();
+
if (DstTy.isVector())
return false; // Should be handled by imported patterns.
@@ -2179,31 +2615,65 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// %v2(s32) = G_ZEXT %v(s8)
if (!IsSigned) {
auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
- if (LoadMI &&
- RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
+ bool IsGPR =
+ RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
+ if (LoadMI && IsGPR) {
const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
unsigned BytesLoaded = MemOp->getSize();
if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
return selectCopy(I, TII, MRI, TRI, RBI);
}
- }
- if (DstSize == 64) {
- // FIXME: Can we avoid manually doing this?
- if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
- << " operand\n");
- return false;
- }
-
- auto SubregToReg =
- MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {})
+ // If we are zero extending from 32 bits to 64 bits, it's possible that
+ // the instruction implicitly does the zero extend for us. In that case,
+ // we can just emit a SUBREG_TO_REG.
+ if (IsGPR && SrcSize == 32 && DstSize == 64) {
+ // Unlike with the G_LOAD case, we don't want to look through copies
+ // here.
+ MachineInstr *Def = MRI.getVRegDef(SrcReg);
+ if (Def && isDef32(*Def)) {
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
+ if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
+ return false;
+ }
+
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
+ return false;
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ if (DstSize == 64) {
+ if (Opcode != TargetOpcode::G_SEXT_INREG) {
+ // FIXME: Can we avoid manually doing this?
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+ << " operand\n");
+ return false;
+ }
+ SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
+ {&AArch64::GPR64RegClass}, {})
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32)
+ .getReg(0);
+ }
+
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
- {DefReg}, {SubregToReg})
+ {DefReg}, {SrcReg})
.addImm(0)
.addImm(SrcSize - 1);
} else if (DstSize <= 32) {
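
Two facts drive this hunk: writing a W register already zeroes bits [63:32], which is why a bare SUBREG_TO_REG suffices for a 32-to-64 G_ZEXT of an isDef32 value, and SBFM/UBFM with immr = 0, imms = SrcSize - 1 copy the low SrcSize bits and sign- or zero-extend them. The bitfield-move semantics, modeled as shifts (arithmetic right shift, well defined in C++20):

#include <cassert>
#include <cstdint>

// SBFMXri / UBFMXri with immr = 0, imms = SrcSize - 1: extend the low
// SrcSize bits of X to 64 bits (signed vs. unsigned respectively).
static int64_t sbfm(uint64_t X, unsigned SrcSize) {
  return int64_t(X << (64 - SrcSize)) >> (64 - SrcSize);
}
static uint64_t ubfm(uint64_t X, unsigned SrcSize) {
  return X & (~uint64_t(0) >> (64 - SrcSize));
}

int main() {
  assert(sbfm(0xFF, 8) == -1);   // G_SEXT i8 -> i64
  assert(ubfm(0xFF, 8) == 0xFF); // G_ZEXT i8 -> i64
  assert(sbfm(0x1FF, 9) == -1);  // G_SEXT_INREG: width taken from the imm
}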
@@ -2236,6 +2706,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return true;
}
+ case TargetOpcode::G_FREEZE:
+ return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_INTTOPTR:
// The importer is currently unable to import pointer types since they
@@ -2294,11 +2766,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
MachineIRBuilder MIRBuilder(I);
- if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
- MIRBuilder))
+ MachineInstr *Cmp;
+ CmpInst::Predicate Pred;
+ std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3),
+ I.getOperand(1), MIRBuilder);
+ if (!Cmp)
return false;
- emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(),
- MIRBuilder);
+ emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
I.eraseFromParent();
return true;
}
@@ -2435,14 +2909,13 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
- MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg},
- {JTAddr, Index})
- .addJumpTableIndex(JTI);
-
+ auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
+ {TargetReg, ScratchReg}, {JTAddr, Index})
+ .addJumpTableIndex(JTI);
// Build the indirect branch.
MIB.buildInstr(AArch64::BR, {}, {TargetReg});
I.eraseFromParent();
- return true;
+ return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectJumpTable(
@@ -2482,7 +2955,7 @@ bool AArch64InstructionSelector::selectTLSGlobalValue(
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- MIB.buildInstr(AArch64::BLR, {}, {Load})
+ MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
.addDef(AArch64::X0, RegState::Implicit)
.addRegMask(TRI.getTLSCallPreservedMask());
@@ -3158,19 +3631,17 @@ bool AArch64InstructionSelector::selectConcatVectors(
}
unsigned
-AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
+AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const {
Type *CPTy = CPVal->getType();
- unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy);
- if (Align == 0)
- Align = MF.getDataLayout().getTypeAllocSize(CPTy);
+ Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
MachineConstantPool *MCP = MF.getConstantPool();
- return MCP->getConstantPoolIndex(CPVal, Align);
+ return MCP->getConstantPoolIndex(CPVal, Alignment);
}
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
- Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
+ const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
auto Adrp =
@@ -3248,7 +3719,7 @@ AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
auto ImmFns = selectArithImmed(RHS);
unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
- auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()});
+ auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS});
// If we matched a valid constant immediate, add those operands.
if (ImmFns) {
@@ -3274,7 +3745,7 @@ AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
- auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
// If we matched a valid constant immediate, add those operands.
if (ImmFns) {
@@ -3316,17 +3787,21 @@ AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
return &*TstMI;
}
-MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
+std::pair<MachineInstr *, CmpInst::Predicate>
+AArch64InstructionSelector::emitIntegerCompare(
MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ assert(Predicate.isPredicate() && "Expected predicate?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
+
// Fold the compare if possible.
MachineInstr *FoldCmp =
tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
if (FoldCmp)
- return FoldCmp;
+ return {FoldCmp, P};
// Can't fold into a CMN. Just emit a normal compare.
unsigned CmpOpc = 0;
@@ -3337,31 +3812,31 @@ MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
"Expected scalar or pointer");
if (CmpTy == LLT::scalar(32)) {
CmpOpc = AArch64::SUBSWrr;
- ZReg = AArch64::WZR;
+ ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
} else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
CmpOpc = AArch64::SUBSXrr;
- ZReg = AArch64::XZR;
+ ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
} else {
- return nullptr;
+ return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE};
}
// Try to match immediate forms.
- auto ImmFns = selectArithImmed(RHS);
- if (ImmFns)
- CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri;
-
- auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg());
- // If we matched a valid constant immediate, add those operands.
- if (ImmFns) {
- for (auto &RenderFn : *ImmFns)
- RenderFn(CmpMI);
- } else {
- CmpMI.addUse(RHS.getReg());
- }
-
+ MachineInstr *ImmedCmp =
+ tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder);
+ if (ImmedCmp)
+ return {ImmedCmp, P};
+
+ // If we don't have an immediate, we may have a shift which can be folded
+ // into the compare.
+ MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder);
+ if (ShiftedCmp)
+ return {ShiftedCmp, P};
+
+ auto CmpMI =
+ MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()});
// Make sure that we can constrain the compare that we emitted.
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return &*CmpMI;
+ return {&*CmpMI, P};
}
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
@@ -3497,8 +3972,16 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
while (CondDef) {
// We can only fold if all of the defs have one use.
- if (!MRI.hasOneUse(CondDef->getOperand(0).getReg()))
- return false;
+ Register CondDefReg = CondDef->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(CondDefReg)) {
+ // Unless it's another select.
+ for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
+ if (CondDef == &UI)
+ continue;
+ if (UI.getOpcode() != TargetOpcode::G_SELECT)
+ return false;
+ }
+ }
// We can skip over G_TRUNC since the condition is 1-bit.
// Truncating/extending can have no impact on the value.
@@ -3524,13 +4007,21 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
- CondCode = changeICMPPredToAArch64CC(
- (CmpInst::Predicate)CondDef->getOperand(1).getPredicate());
- if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
- CondDef->getOperand(1), MIB)) {
+ MachineInstr *Cmp;
+ CmpInst::Predicate Pred;
+
+ std::tie(Cmp, Pred) =
+ emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
+ CondDef->getOperand(1), MIB);
+
+ if (!Cmp) {
LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
return false;
}
+
+ // Have to collect the CondCode after emitIntegerCompare, since it can
+ // update the predicate.
+ CondCode = changeICMPPredToAArch64CC(Pred);
} else {
// Get the condition code for the select.
AArch64CC::CondCode CondCode2;
@@ -3660,119 +4151,150 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
return nullptr;
}
-bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
- // Try to match a vector splat operation into a dup instruction.
- // We're looking for this pattern:
- // %scalar:gpr(s64) = COPY $x0
- // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
- // %cst0:gpr(s32) = G_CONSTANT i32 0
- // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
- // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
- // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
- // %zerovec(<2 x s32>)
- //
- // ...into:
- // %splat = DUP %scalar
- // We use the regbank of the scalar to determine which kind of dup to use.
- MachineIRBuilder MIB(I);
+MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
+ MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P,
+ MachineIRBuilder &MIB) const {
+ // Attempt to select the immediate form of an integer compare.
MachineRegisterInfo &MRI = *MIB.getMRI();
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- using namespace TargetOpcode;
- using namespace MIPatternMatch;
-
- // Begin matching the insert.
- auto *InsMI =
- getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
- if (!InsMI)
- return false;
- // Match the undef vector operand.
- auto *UndefMI =
- getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
- if (!UndefMI)
- return false;
- // Match the scalar being splatted.
- Register ScalarReg = InsMI->getOperand(2).getReg();
- const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
- // Match the index constant 0.
- int64_t Index = 0;
- if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
- return false;
-
- // The shuffle's second operand doesn't matter if the mask is all zero.
- ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
- if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
- return false;
+ auto Ty = MRI.getType(LHS.getReg());
+ assert(!Ty.isVector() && "Expected scalar or pointer only?");
+ unsigned Size = Ty.getSizeInBits();
+ assert((Size == 32 || Size == 64) &&
+ "Expected 32 bit or 64 bit compare only?");
+
+ // Check if this is a case we can already handle.
+ InstructionSelector::ComplexRendererFns ImmFns;
+ ImmFns = selectArithImmed(RHS);
+
+ if (!ImmFns) {
+ // We didn't get a rendering function, but we may still have a constant.
+ auto MaybeImmed = getImmedFromMO(RHS);
+ if (!MaybeImmed)
+ return nullptr;
- // We're done, now find out what kind of splat we need.
- LLT VecTy = MRI.getType(I.getOperand(0).getReg());
- LLT EltTy = VecTy.getElementType();
- if (EltTy.getSizeInBits() < 32) {
- LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
- return false;
- }
- bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
- unsigned Opc = 0;
- if (IsFP) {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32lane;
- } else {
- Opc = AArch64::DUPv4i32lane;
- assert(VecTy.getNumElements() == 4);
- }
+ // We have a constant, but it doesn't fit. Try adjusting it by one and
+ // updating the predicate if possible.
+ uint64_t C = *MaybeImmed;
+ CmpInst::Predicate NewP;
+ switch (P) {
+ default:
+ return nullptr;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ // Check for
+ //
+ // x slt c => x sle c - 1
+ // x sge c => x sgt c - 1
+ //
+ // When c is not the smallest possible negative number.
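+      //
+      // E.g. 4097 is not a legal arithmetic immediate, but 4096 is (it
+      // encodes as 1 << 12), so x slt 4097 can be selected as x sle 4096
+      // (illustrative values).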
+ if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
+ (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
+ C -= 1;
break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64lane;
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_UGE:
+ // Check for
+ //
+ // x ult c => x ule c - 1
+ // x uge c => x ugt c - 1
+ //
+ // When c is not zero.
+ if (C == 0)
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ C -= 1;
break;
- }
- } else {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32gpr;
- } else {
- Opc = AArch64::DUPv4i32gpr;
- assert(VecTy.getNumElements() == 4);
- }
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_SGT:
+ // Check for
+ //
+ // x sle c => x slt c + 1
+      // x sgt c => x sge c + 1
+ //
+ // When c is not the largest possible signed integer.
+ if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
+ (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
+ C += 1;
break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64gpr;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_UGT:
+ // Check for
+ //
+ // x ule c => x ult c + 1
+      // x ugt c => x uge c + 1
+ //
+ // When c is not the largest possible unsigned integer.
+ if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
+ (Size == 64 && C == UINT64_MAX))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+ C += 1;
break;
}
+
+ // Check if the new constant is valid.
+ if (Size == 32)
+ C = static_cast<uint32_t>(C);
+ ImmFns = select12BitValueWithLeftShift(C);
+ if (!ImmFns)
+ return nullptr;
+ P = NewP;
}
- assert(Opc && "Did not compute an opcode for a dup");
- // For FP splats, we need to widen the scalar reg via undef too.
- if (IsFP) {
- MachineInstr *Widen = emitScalarToVector(
- EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
- if (!Widen)
- return false;
- ScalarReg = Widen->getOperand(0).getReg();
+ // At this point, we know we can select an immediate form. Go ahead and do
+ // that.
+ Register ZReg;
+ unsigned Opc;
+ if (Size == 32) {
+ ZReg = AArch64::WZR;
+ Opc = AArch64::SUBSWri;
+ } else {
+ ZReg = AArch64::XZR;
+ Opc = AArch64::SUBSXri;
}
- auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
- if (IsFP)
- Dup.addImm(0);
- constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
- I.eraseFromParent();
- return true;
+
+ auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
}
-bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
- if (TM.getOptLevel() == CodeGenOpt::None)
- return false;
- if (tryOptVectorDup(I))
- return true;
- return false;
+MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
+ MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const {
+ // We are looking for the following pattern:
+ //
+  // shift = G_SHL/G_ASHR/G_LSHR y, c
+ // ...
+ // cmp = G_ICMP pred, something, shift
+ //
+ // Since we will select the G_ICMP to a SUBS, we can potentially fold the
+ // shift into the subtract.
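+  //
+  // E.g. a 64-bit G_ICMP against (G_SHL %y, 3) can select to
+  // subs xzr, x, y, lsl #3 instead of materializing the shift separately
+  // (illustrative).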
+ static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs};
+ static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR};
+ auto ImmFns = selectShiftedRegister(RHS);
+ if (!ImmFns)
+ return nullptr;
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ auto Ty = MRI.getType(LHS.getReg());
+ assert(!Ty.isVector() && "Expected scalar or pointer only?");
+ unsigned Size = Ty.getSizeInBits();
+ bool Idx = (Size == 64);
+ Register ZReg = ZRegTable[Idx];
+ unsigned Opc = OpcTable[Idx];
+ auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
}
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
- if (tryOptVectorShuffle(I))
- return true;
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
const LLT Src1Ty = MRI.getType(Src1Reg);
@@ -3852,9 +4374,8 @@ bool AArch64InstructionSelector::selectShuffleVector(
.addUse(Src2Reg)
.addImm(AArch64::qsub1);
- auto TBL2 =
- MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
- {RegSeq, IndexLoad->getOperand(0).getReg()});
+ auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
+ {RegSeq, IndexLoad->getOperand(0)});
constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
I.eraseFromParent();
@@ -3968,6 +4489,44 @@ bool AArch64InstructionSelector::selectInsertElt(
return true;
}
+bool AArch64InstructionSelector::tryOptConstantBuildVec(
+ MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!");
+ if (DstTy.getSizeInBits() < 32)
+ return false;
+ // Check if we're building a constant vector, in which case we want to
+ // generate a constant pool load instead of a vector insert sequence.
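+  // E.g. a G_BUILD_VECTOR of four constant s32s becomes an ADRP plus a load
+  // of a <4 x i32> constant-pool entry (via emitLoadFromConstantPool).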
+ SmallVector<Constant *, 16> Csts;
+ for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
+ // Try to find G_CONSTANT or G_FCONSTANT
+ auto *OpMI =
+ getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
+ if (OpMI)
+ Csts.emplace_back(
+ const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
+ else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
+ I.getOperand(Idx).getReg(), MRI)))
+ Csts.emplace_back(
+ const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
+ else
+ return false;
+ }
+ Constant *CV = ConstantVector::get(Csts);
+ MachineIRBuilder MIB(I);
+ auto *CPLoad = emitLoadFromConstantPool(CV, MIB);
+ if (!CPLoad) {
+    LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector\n");
+ return false;
+ }
+ MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0));
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+ *MRI.getRegClass(CPLoad->getOperand(0).getReg()),
+ MRI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectBuildVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
@@ -3976,6 +4535,9 @@ bool AArch64InstructionSelector::selectBuildVector(
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
unsigned EltSize = EltTy.getSizeInBits();
+
+ if (tryOptConstantBuildVec(I, DstTy, MRI))
+ return true;
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
@@ -4081,8 +4643,8 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
return true;
}
-bool AArch64InstructionSelector::selectIntrinsic(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
@@ -4091,7 +4653,7 @@ bool AArch64InstructionSelector::selectIntrinsic(
switch (IntrinID) {
default:
break;
- case Intrinsic::aarch64_crypto_sha1h:
+ case Intrinsic::aarch64_crypto_sha1h: {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(2).getReg();
@@ -4130,28 +4692,59 @@ bool AArch64InstructionSelector::selectIntrinsic(
I.eraseFromParent();
return true;
}
- return false;
-}
+ case Intrinsic::frameaddress:
+ case Intrinsic::returnaddress: {
+ MachineFunction &MF = *I.getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
-static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
- auto &MI = *Root.getParent();
- auto &MBB = *MI.getParent();
- auto &MF = *MBB.getParent();
- auto &MRI = MF.getRegInfo();
- uint64_t Immed;
- if (Root.isImm())
- Immed = Root.getImm();
- else if (Root.isCImm())
- Immed = Root.getCImm()->getZExtValue();
- else if (Root.isReg()) {
- auto ValAndVReg =
- getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
- if (!ValAndVReg)
- return None;
- Immed = ValAndVReg->Value;
- } else
- return None;
- return Immed;
+ unsigned Depth = I.getOperand(2).getImm();
+ Register DstReg = I.getOperand(0).getReg();
+ RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
+
+ if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
+ if (MFReturnAddr) {
+ MIRBuilder.buildCopy({DstReg}, MFReturnAddr);
+ I.eraseFromParent();
+ return true;
+ }
+ MFI.setReturnAddressIsTaken(true);
+ MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass);
+ // Insert the copy from LR/X30 into the entry block, before it can be
+ // clobbered by anything.
+ MachineBasicBlock &EntryBlock = *MF.begin();
+ if (!EntryBlock.isLiveIn(AArch64::LR))
+ EntryBlock.addLiveIn(AArch64::LR);
+ MachineIRBuilder EntryBuilder(MF);
+ EntryBuilder.setInstr(*EntryBlock.begin());
+ EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ MFReturnAddr = DstReg;
+ I.eraseFromParent();
+ return true;
+ }
+
+ MFI.setFrameAddressIsTaken(true);
+ Register FrameAddr(AArch64::FP);
+ while (Depth--) {
+ Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
+ auto Ldr =
+ MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr})
+ .addImm(0);
+ constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
+ FrameAddr = NextFrame;
+ }
+
+ if (IntrinID == Intrinsic::frameaddress)
+ MIRBuilder.buildCopy({DstReg}, {FrameAddr});
+ else {
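+      // The frame record saves LR at [fp, #8]; LDRXui scales its immediate
+      // by 8, so offset 1 loads the return address of the selected frame.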
+ MFI.setReturnAddressIsTaken(true);
+ MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
}
InstructionSelector::ComplexRendererFns
@@ -4271,7 +4864,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MachineInstr &MI, const MachineRegisterInfo &MRI) const {
// Always fold if there is one use, or if we're optimizing for size.
Register DefReg = MI.getOperand(0).getReg();
- if (MRI.hasOneUse(DefReg) ||
+ if (MRI.hasOneNonDBGUse(DefReg) ||
MI.getParent()->getParent()->getFunction().hasMinSize())
return true;
@@ -4283,10 +4876,21 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
// If it is, then we should fold.
- return all_of(MRI.use_instructions(DefReg),
+ return all_of(MRI.use_nodbg_instructions(DefReg),
[](MachineInstr &Use) { return Use.mayLoadOrStore(); });
}
+static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
+ switch (Type) {
+ case AArch64_AM::SXTB:
+ case AArch64_AM::SXTH:
+ case AArch64_AM::SXTW:
+ return true;
+ default:
+ return false;
+ }
+}
+
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtendedSHL(
MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
@@ -4359,7 +4963,10 @@ AArch64InstructionSelector::selectExtendedSHL(
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
- SignExtend = Ext == AArch64_AM::SXTW;
+ SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
+ // We only support SXTW for signed extension here.
+ if (SignExtend && Ext != AArch64_AM::SXTW)
+ return None;
// Need a 32-bit wide register here.
MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
@@ -4441,7 +5048,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
// If this is used more than once, let's not bother folding.
// TODO: Check if they are memory ops. If they are, then we can still fold
// without having to recompute anything.
- if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
+ if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
return None;
// Base is the GEP's LHS, offset is its RHS.
@@ -4595,14 +5202,46 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
return None;
}
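+/// Fold the G_ADD_LOW of an ADRP + G_ADD_LOW pair into a load/store
+/// immediate, so that e.g. (sketch):
+///
+///   adrp x8, sym
+///   ldr  w0, [x8, :lo12:sym]
+///
+/// is emitted instead of a separate address add.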
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
+ unsigned Size,
+ MachineRegisterInfo &MRI) const {
+ if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
+ return None;
+ MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
+ if (Adrp.getOpcode() != AArch64::ADRP)
+ return None;
+
+ // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
+ // TODO: Need to check GV's offset % size if doing offset folding into globals.
+ assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global");
+ auto GV = Adrp.getOperand(1).getGlobal();
+ if (GV->isThreadLocal())
+ return None;
+
+ auto &MF = *RootDef.getParent()->getParent();
+ if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
+ return None;
+
+ unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
+ MachineIRBuilder MIRBuilder(RootDef);
+ Register AdrpReg = Adrp.getOperand(0).getReg();
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addGlobalAddress(GV, /* Offset */ 0,
+ OpFlags | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ }}};
+}
+
/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
+ MachineFunction &MF = *Root.getParent()->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
if (!Root.isReg())
return None;
@@ -4618,6 +5257,14 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+ CodeModel::Model CM = MF.getTarget().getCodeModel();
+ // Check if we can fold in the ADD of small code model ADRP + ADD address.
+ if (CM == CodeModel::Small) {
+ auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
+ if (OpFns)
+ return OpFns;
+ }
+
if (isBaseWithConstantOffset(Root, MRI)) {
MachineOperand &LHS = RootDef->getOperand(1);
MachineOperand &RHS = RootDef->getOperand(2);
@@ -4717,7 +5364,11 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
// Handle explicit extend instructions first.
if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
- unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned Size;
+ if (Opc == TargetOpcode::G_SEXT)
+ Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ else
+ Size = MI.getOperand(2).getImm();
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
@@ -4782,6 +5433,52 @@ Register AArch64InstructionSelector::narrowExtendRegIfNeeded(
return Copy.getReg(0);
}
+Register AArch64InstructionSelector::widenGPRBankRegIfNeeded(
+ Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const {
+ assert(WideSize >= 8 && "WideSize is smaller than all possible registers?");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NarrowSize = MRI.getType(Reg).getSizeInBits();
+ assert(WideSize >= NarrowSize &&
+ "WideSize cannot be smaller than NarrowSize!");
+
+ // If the sizes match, just return the register.
+ //
+ // If NarrowSize is an s1, then we can select it to any size, so we'll treat
+ // it as a don't care.
+ if (NarrowSize == WideSize || NarrowSize == 1)
+ return Reg;
+
+ // Now check the register classes.
+ const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
+ const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize);
+ const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize);
+ assert(OrigRC && "Could not determine narrow RC?");
+ assert(WideRC && "Could not determine wide RC?");
+
+ // If the sizes differ, but the register classes are the same, there is no
+ // need to insert a SUBREG_TO_REG.
+ //
+ // For example, an s8 that's supposed to be a GPR will be selected to either
+ // a GPR32 or a GPR64 register. Note that this assumes that the s8 will
+ // always end up on a GPR32.
+ if (OrigRC == WideRC)
+ return Reg;
+
+ // We have two different register classes. Insert a SUBREG_TO_REG.
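+  //
+  // E.g. widening an s16 on a GPR32 to 64 bits gives (sketch):
+  // %wide:gpr64 = SUBREG_TO_REG 0, %narrow:gpr32, %subreg.sub_32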
+ unsigned SubReg = 0;
+ getSubRegForClass(OrigRC, TRI, SubReg);
+ assert(SubReg && "Couldn't determine subregister?");
+
+ // Build the SUBREG_TO_REG and return the new, widened register.
+ auto SubRegToReg =
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {})
+ .addImm(0)
+ .addUse(Reg)
+ .addImm(SubReg);
+ constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI);
+ return SubRegToReg.getReg(0);
+}
+
/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
@@ -4908,6 +5605,95 @@ bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
}
}
+
+// Perform fixups on the given PHI instruction's operands to force them all
+// to be the same as the destination regbank.
+static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const AArch64RegisterBankInfo &RBI) {
+ assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
+ assert(DstRB && "Expected PHI dst to have regbank assigned");
+ MachineIRBuilder MIB(MI);
+
+ // Go through each operand and ensure it has the same regbank.
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ Register OpReg = MO.getReg();
+ const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
+ if (RB != DstRB) {
+ // Insert a cross-bank copy.
+ auto *OpDef = MRI.getVRegDef(OpReg);
+ const LLT &Ty = MRI.getType(OpReg);
+ MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
+ auto Copy = MIB.buildCopy(Ty, OpReg);
+ MRI.setRegBank(Copy.getReg(0), *DstRB);
+ MO.setReg(Copy.getReg(0));
+ }
+ }
+}
+
+void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
+ // We're looking for PHIs, build a list so we don't invalidate iterators.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<MachineInstr *, 32> Phis;
+ for (auto &BB : MF) {
+ for (auto &MI : BB) {
+ if (MI.getOpcode() == TargetOpcode::G_PHI)
+ Phis.emplace_back(&MI);
+ }
+ }
+
+ for (auto *MI : Phis) {
+ // We need to do some work here if the operand types are < 16 bit and they
+ // are split across fpr/gpr banks. Since all types <32b on gpr
+ // end up being assigned gpr32 regclasses, we can end up with PHIs here
+ // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
+    // be selecting heterogeneous regbanks for operands if possible, but we
+ // still need to be able to deal with it here.
+ //
+ // To fix this, if we have a gpr-bank operand < 32b in size and at least
+ // one other operand is on the fpr bank, then we add cross-bank copies
+ // to homogenize the operand banks. For simplicity the bank that we choose
+ // to settle on is whatever bank the def operand has. For example:
+ //
+ // %endbb:
+ // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
+ // =>
+ // %bb2:
+ // ...
+ // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
+ // ...
+ // %endbb:
+ // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
+ bool HasGPROp = false, HasFPROp = false;
+ for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
+ const auto &MO = MI->getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ const LLT &Ty = MRI.getType(MO.getReg());
+ if (!Ty.isValid() || !Ty.isScalar())
+ break;
+ if (Ty.getSizeInBits() >= 32)
+ break;
+ const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
+      // If for some reason we don't have a regbank yet, don't try anything.
+ if (!RB)
+ break;
+
+ if (RB->getID() == AArch64::GPRRegBankID)
+ HasGPROp = true;
+ else
+ HasFPROp = true;
+ }
+    // We have heterogeneous regbanks, so fix them up.
+ if (HasGPROp && HasFPROp)
+ fixupPHIOpBanks(*MI, MRI, RBI);
+ }
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 95719a35c6daa..2eaec0b970fa6 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -30,7 +30,8 @@ using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
-AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
+AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
+ : ST(&ST) {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 64);
const LLT s1 = LLT::scalar(1);
@@ -52,13 +53,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT v2s64 = LLT::vector(2, 64);
const LLT v2p0 = LLT::vector(2, p0);
+ const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
+
// FIXME: support subtargets which have neon/fp-armv8 disabled.
if (!ST.hasNEON() || !ST.hasFPARMv8()) {
computeTables();
return;
}
- getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
.clampScalar(0, s1, s64)
.widenScalarToNextPow2(0, 8)
@@ -105,10 +108,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.minScalarSameAs(1, 0);
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalFor({{p0, s64}})
+ .legalFor({{p0, s64}, {v2p0, v2s64}})
.clampScalar(1, s64, s64);
- getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
+ getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32, s64})
@@ -375,7 +378,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
- getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+ getActionDefinitionsBuilder(G_SEXT_INREG)
+ .legalFor({s32, s64})
+ .lower();
// FP conversions
getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
@@ -413,7 +418,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// Pointer-handling
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
- getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+
+ if (TM.getCodeModel() == CodeModel::Small)
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
+ else
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
getActionDefinitionsBuilder(G_PTRTOINT)
.legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
@@ -617,10 +626,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
verify(*ST.getInstrInfo());
}
-bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
+bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
switch (MI.getOpcode()) {
default:
// No idea what to do.
@@ -634,19 +644,53 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
}
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
+ // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
+ // G_ADD_LOW instructions.
+ // By splitting this here, we can optimize accesses in the small code model by
+  // folding the G_ADD_LOW into the load/store offset.
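+  //
+  // E.g. (sketch):
+  //   %g:_(p0) = G_GLOBAL_VALUE @var
+  // ==>
+  //   %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @var
+  //   %g:_(p0) = G_ADD_LOW %adrp(p0),
+  //              target-flags(aarch64-pageoff, aarch64-nc) @var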
+ auto GV = MI.getOperand(1).getGlobal();
+ if (GV->isThreadLocal())
+ return true; // Don't want to modify TLS vars.
+
+ auto &TM = ST->getTargetLowering()->getTargetMachine();
+ unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
+
+ if (OpFlags & AArch64II::MO_GOT)
+ return true;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
+ .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ // Set the regclass on the dest reg too.
+ MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
+
+ MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
+ .addGlobalAddress(GV, 0,
+ OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeIntrinsic(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+ LegalizerHelper &Helper, MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memset:
case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
LegalizerHelper::UnableToLegalize)
return false;
MI.eraseFromParent();
@@ -675,7 +719,6 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
if (Amount > 31)
return true; // This will have to remain a register variant.
assert(MRI.getType(AmtReg).getSizeInBits() == 32);
- MIRBuilder.setInstr(MI);
auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
MI.getOperand(2).setReg(ExtCst.getReg(0));
return true;
@@ -704,17 +747,15 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
return false;
}
- MIRBuilder.setInstr(MI);
unsigned PtrSize = ValTy.getElementType().getSizeInBits();
const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize);
auto &MMO = **MI.memoperands_begin();
if (MI.getOpcode() == TargetOpcode::G_STORE) {
- auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg});
- MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO);
+ auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
+ MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
} else {
- Register NewReg = MRI.createGenericVirtualRegister(NewTy);
- auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO);
- MIRBuilder.buildBitcast({ValReg}, {NewLoad});
+ auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
+ MIRBuilder.buildBitcast(ValReg, NewLoad);
}
MI.eraseFromParent();
return true;
@@ -723,9 +764,8 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
- MIRBuilder.setInstr(MI);
MachineFunction &MF = MIRBuilder.getMF();
- unsigned Align = MI.getOperand(2).getImm();
+ Align Alignment(MI.getOperand(2).getImm());
Register Dst = MI.getOperand(0).getReg();
Register ListPtr = MI.getOperand(1).getReg();
@@ -733,21 +773,19 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
- Register List = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildLoad(
- List, ListPtr,
+ const Align PtrAlign = Align(PtrSize);
+ auto List = MIRBuilder.buildLoad(
+ PtrTy, ListPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- PtrSize, /* Align = */ PtrSize));
+ PtrSize, PtrAlign));
- Register DstPtr;
- if (Align > PtrSize) {
+ MachineInstrBuilder DstPtr;
+ if (Alignment > PtrAlign) {
// Realign the list to the actual required alignment.
- auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
-
+ auto AlignMinus1 =
+ MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
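+    // I.e. round the list pointer up: add align - 1, then clear the low
+    // log2(align) bits via the mask below.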
auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
-
- DstPtr = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+ DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
} else
DstPtr = List;
@@ -755,16 +793,16 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MIRBuilder.buildLoad(
Dst, DstPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- ValSize, std::max(Align, PtrSize)));
+ ValSize, std::max(Alignment, PtrAlign)));
- auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize));
+ auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
- MIRBuilder.buildStore(
- NewList, ListPtr,
- *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
- PtrSize, /* Align = */ PtrSize));
+ MIRBuilder.buildStore(NewList, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ PtrSize, PtrAlign));
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 15161bab466c4..1cb24559c1abf 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -27,12 +27,10 @@ class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo(const AArch64Subtarget &ST);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
private:
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -43,6 +41,11 @@ private:
bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const;
+
+ bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const;
+ const AArch64Subtarget *ST;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
new file mode 100644
index 0000000000000..baa8515baf3ea
--- /dev/null
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -0,0 +1,507 @@
+//===- AArch64PostLegalizerCombiner.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This performs post-legalization combines on generic MachineInstrs.
+//
+// Any combine that this pass performs must preserve instruction legality.
+// Combines unconcerned with legality should be handled by the
+// PreLegalizerCombiner instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
+///
+/// Used for matching target-supported shuffles before codegen.
+struct ShuffleVectorPseudo {
+ unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1)
+ Register Dst; ///< Destination register.
+ SmallVector<SrcOp, 2> SrcOps; ///< Source registers.
+ ShuffleVectorPseudo(unsigned Opc, Register Dst,
+ std::initializer_list<SrcOp> SrcOps)
+      : Opc(Opc), Dst(Dst), SrcOps(SrcOps) {}
+ ShuffleVectorPseudo() {}
+};
+
+/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
+/// If \p MI is not a splat, returns None.
+static Optional<int> getSplatIndex(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Only G_SHUFFLE_VECTOR can have a splat index!");
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
+
+ // If all elements are undefined, this shuffle can be considered a splat.
+  // Return 0 so callers have the best chance to simplify further.
+ if (FirstDefinedIdx == Mask.end())
+ return 0;
+
+ // Make sure all remaining elements are either undef or the same
+ // as the first non-undef value.
+ int SplatValue = *FirstDefinedIdx;
+ if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
+ [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
+ return None;
+
+ return SplatValue;
+}
+
+/// Check if a vector shuffle corresponds to a REV instruction with the
+/// specified blocksize.
+static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
+ unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+ assert(EltSize != 64 && "EltSize cannot be 64 for REV mask.");
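+
+  // E.g. a REV64 on <4 x s32> reverses the two s32 elements within each
+  // 64-bit block, so the mask is <1, 0, 3, 2>.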
+
+ unsigned BlockElts = M[0] + 1;
+
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSize;
+
+ if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // Ignore undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) !=
+ (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
+/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts.
+/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult.
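+/// E.g. for NumElts = 4, mask <0, 4, 2, 6> selects G_TRN1 and mask
+/// <1, 5, 3, 7> selects G_TRN2.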
+static bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
+ (M[i + 1] >= 0 &&
+ static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
+/// sources of the shuffle are different.
+static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
+ unsigned NumElts) {
+ // Look for the first non-undef element.
+ auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+ if (FirstRealElt == M.end())
+ return None;
+
+ // Use APInt to handle overflow when calculating expected element.
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+
+ // The following shuffle indices must be the successive elements after the
+ // first real element.
+ if (any_of(
+ make_range(std::next(FirstRealElt), M.end()),
+ [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
+ return None;
+
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element. E.g.
+ // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+ // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+ // ExpectedElt is the last mask index plus 1.
+ uint64_t Imm = ExpectedElt.getZExtValue();
+ bool ReverseExt = false;
+
+  // There are two different cases that require reversing the input vectors.
+ // For example, for vector <4 x i32> we have the following cases,
+ // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+ // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+ // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+ // to reverse two input vectors.
+ if (Imm < NumElts)
+ ReverseExt = true;
+ else
+ Imm -= NumElts;
+ return std::make_pair(ReverseExt, Imm);
+}
+
+/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
+/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
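+/// E.g. for NumElts = 4, mask <0, 2, 4, 6> selects G_UZP1 and mask
+/// <1, 3, 5, 7> selects G_UZP2.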
+static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Skip undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+ return false;
+ }
+ return true;
+}
+
+/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
+/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
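+/// E.g. for NumElts = 4, mask <0, 4, 1, 5> selects G_ZIP1 and mask
+/// <2, 6, 3, 7> selects G_ZIP2.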
+static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+
+ // 0 means use ZIP1, 1 means use ZIP2.
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
+ (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
+ return false;
+ Idx += 1;
+ }
+ return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
+/// G_REV instruction. Returns the appropriate G_REV opcode in \p MatchInfo.
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Dst);
+ unsigned EltSize = Ty.getScalarSizeInBits();
+
+ // Element size for a rev cannot be 64.
+ if (EltSize == 64)
+ return false;
+
+ unsigned NumElts = Ty.getNumElements();
+
+ // Try to produce G_REV64
+ if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) {
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src});
+ return true;
+ }
+
+ // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support.
+ // This should be identical to above, but with a constant 32 and constant
+ // 16.
+ return false;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_TRN1 or G_TRN2 instruction.
+static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_UZP1 or G_UZP2 instruction.
+///
+/// \param [in] MI - The shuffle vector instruction.
+/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success.
+static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isZipMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ if (Lane != 0)
+ return false;
+
+ // Try to match a vector splat operation into a dup instruction.
+ // We're looking for this pattern:
+ //
+ // %scalar:gpr(s64) = COPY $x0
+ // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ // %cst0:gpr(s32) = G_CONSTANT i32 0
+ // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+ // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
+  //                         %zerovec(<2 x s32>)
+ //
+ // ...into:
+ // %splat = G_DUP %scalar
+
+ // Begin matching the insert.
+ auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
+ MI.getOperand(1).getReg(), MRI);
+ if (!InsMI)
+ return false;
+ // Match the undef vector operand.
+ if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(),
+ MRI))
+ return false;
+
+ // Match the index constant 0.
+ int64_t Index = 0;
+ if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+ return false;
+
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(),
+ {InsMI->getOperand(2).getReg()});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(Lane >= 0 && "Expected positive lane?");
+ // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
+ // lane's definition directly.
+ auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
+ MI.getOperand(1).getReg(), MRI);
+ if (!BuildVecMI)
+ return false;
+ Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
+ MatchInfo =
+ ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg});
+ return true;
+}
+
+static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ auto MaybeLane = getSplatIndex(MI);
+ if (!MaybeLane)
+ return false;
+ int Lane = *MaybeLane;
+  // If this is an undef splat, generate it via a plain G_DUP if possible.
+ if (Lane < 0)
+ Lane = 0;
+ if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo))
+ return true;
+ if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo))
+ return true;
+ return false;
+}
+
+static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ Register Dst = MI.getOperand(0).getReg();
+ auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(),
+ MRI.getType(Dst).getNumElements());
+ if (!ExtInfo)
+ return false;
+ bool ReverseExt;
+ uint64_t Imm;
+ std::tie(ReverseExt, Imm) = *ExtInfo;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ if (ReverseExt)
+ std::swap(V1, V2);
+ uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8;
+ Imm *= ExtFactor;
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm});
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
+/// \p MatchInfo supplies the opcode and operands. \p MI is the G_SHUFFLE_VECTOR.
+static bool applyShuffleVectorPseudo(MachineInstr &MI,
+ ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT.
+/// Special-cased because the constant operand must be emitted as a G_CONSTANT
+/// for the imported tablegen patterns to work.
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ // Tablegen patterns expect an i32 G_CONSTANT as the final op.
+ auto Cst =
+ MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm());
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst},
+ {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst});
+ MI.eraseFromParent();
+ return true;
+}
+
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AArch64PostLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AArch64GenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AArch64PostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB,
+ MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ const auto *LI =
+ MI.getParent()->getParent()->getSubtarget().getLegalizerInfo();
+ CombinerHelper Helper(Observer, B, KB, MDT, LI);
+ AArch64GenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+ return Generated.tryCombineAll(Observer, MI, B, Helper);
+}
+
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+class AArch64PostLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64PostLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AArch64PostLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized) &&
+ "Expected a legalized function?");
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AArch64PostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 MachineInstrs after legalization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 MachineInstrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) {
+ return new AArch64PostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 230fd514d0222..9a1f200d52222 100644
--- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -27,28 +27,62 @@
using namespace llvm;
using namespace MIPatternMatch;
+/// Return true if a G_FCONSTANT instruction is known to be better-represented
+/// as a G_CONSTANT.
+static bool matchFConstantToConstant(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ Register DstReg = MI.getOperand(0).getReg();
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ if (DstSize != 32 && DstSize != 64)
+ return false;
+
+ // When we're storing a value, it doesn't matter what register bank it's on.
+  // Since not all floating-point constants can be materialized using an fmov,
+ // it makes more sense to just use a GPR.
+ return all_of(MRI.use_nodbg_instructions(DstReg),
+ [](const MachineInstr &Use) { return Use.mayStore(); });
+}
+
+/// Change a G_FCONSTANT into a G_CONSTANT.
+static void applyFConstantToConstant(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ MachineIRBuilder MIB(MI);
+ const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
+ MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
+ MI.eraseFromParent();
+}
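
The match/apply pair above rewrites a G_FCONSTANT whose only (non-debug) users are stores into a G_CONSTANT carrying the same bit pattern. A minimal standalone sketch of the bit-cast step, assuming only LLVM's APFloat/APInt support headers:

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/APInt.h"

    // The payload swap performed by applyFConstantToConstant:
    // G_FCONSTANT float 1.0 becomes G_CONSTANT i32 0x3f800000.
    llvm::APInt fconstantPayload() {
      llvm::APFloat F(1.0f);
      return F.bitcastToAPInt(); // 32 bits, value 0x3f800000
    }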
+
+class AArch64PreLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+
+public:
+ AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper)
+ : Helper(Helper) {}
+};
+
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
namespace {
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
+ AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
public:
- AArch64GenPreLegalizerCombinerHelper Generated;
-
AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT) {
- if (!Generated.parseCommandLineOption())
+ if (!GeneratedRuleCfg.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
@@ -60,6 +94,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
+ AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);
switch (MI.getOpcode()) {
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
@@ -79,7 +114,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
}
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
switch (MI.getOpcode()) {
@@ -93,7 +128,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
// Pass boilerplate
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 40efac261fd99..7e3ff1948dad7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -38,58 +38,58 @@ using namespace llvm;
AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
: AArch64GenRegisterBankInfo() {
- static bool AlreadyInit = false;
- // We have only one set of register banks, whatever the subtarget
- // is. Therefore, the initialization of the RegBanks table should be
- // done only once. Indeed the table of all register banks
- // (AArch64::RegBanks) is unique in the compiler. At some point, it
- // will get tablegen'ed and the whole constructor becomes empty.
- if (AlreadyInit)
- return;
- AlreadyInit = true;
-
- const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
- (void)RBGPR;
- assert(&AArch64::GPRRegBank == &RBGPR &&
- "The order in RegBanks is messed up");
-
- const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
- (void)RBFPR;
- assert(&AArch64::FPRRegBank == &RBFPR &&
- "The order in RegBanks is messed up");
-
- const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID);
- (void)RBCCR;
- assert(&AArch64::CCRegBank == &RBCCR && "The order in RegBanks is messed up");
-
- // The GPR register bank is fully defined by all the registers in
- // GR64all + its subclasses.
- assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
-
- // The FPR register bank is fully defined by all the registers in
- // GR64all + its subclasses.
- assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
- "Subclass not added?");
- assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
- "Subclass not added?");
- assert(RBFPR.getSize() == 512 &&
- "FPRs should hold up to 512-bit via QQQQ sequence");
-
- assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
- "Class not added?");
- assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
-
- // Check that the TableGen'ed like file is in sync we our expectations.
- // First, the Idx.
- assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
- {PMI_GPR32, PMI_GPR64}) &&
- "PartialMappingIdx's are incorrectly ordered");
- assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
- {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
- PMI_FPR256, PMI_FPR512}) &&
- "PartialMappingIdx's are incorrectly ordered");
+ static llvm::once_flag InitializeRegisterBankFlag;
+
+ static auto InitializeRegisterBankOnce = [&]() {
+ // We have only one set of register banks, whatever the subtarget
+ // is. Therefore, the initialization of the RegBanks table should be
+ // done only once. Indeed the table of all register banks
+ // (AArch64::RegBanks) is unique in the compiler. At some point, it
+    // will get tablegen'ed and the whole constructor will become empty.
+
+ const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+ (void)RBGPR;
+ assert(&AArch64::GPRRegBank == &RBGPR &&
+ "The order in RegBanks is messed up");
+
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(&AArch64::FPRRegBank == &RBFPR &&
+ "The order in RegBanks is messed up");
+
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID);
+ (void)RBCCR;
+ assert(&AArch64::CCRegBank == &RBCCR &&
+ "The order in RegBanks is messed up");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+ // The FPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.getSize() == 512 &&
+ "FPRs should hold up to 512-bit via QQQQ sequence");
+
+ assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+ "Class not added?");
+ assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+    // Check that the TableGen'ed file is in sync with our expectations.
+ // First, the Idx.
+ assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+ {PMI_GPR32, PMI_GPR64}) &&
+ "PartialMappingIdx's are incorrectly ordered");
+ assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
+ PMI_FPR256, PMI_FPR512}) &&
+ "PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
@@ -99,14 +99,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#Idx " is incorrectly initialized"); \
} while (false)
- CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
- CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
- CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
+ CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
+ CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+ CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
// Check value mapping.
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
@@ -119,14 +119,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
- CHECK_VALUEMAP(GPR, 32);
- CHECK_VALUEMAP(GPR, 64);
- CHECK_VALUEMAP(FPR, 16);
- CHECK_VALUEMAP(FPR, 32);
- CHECK_VALUEMAP(FPR, 64);
- CHECK_VALUEMAP(FPR, 128);
- CHECK_VALUEMAP(FPR, 256);
- CHECK_VALUEMAP(FPR, 512);
+ CHECK_VALUEMAP(GPR, 32);
+ CHECK_VALUEMAP(GPR, 64);
+ CHECK_VALUEMAP(FPR, 16);
+ CHECK_VALUEMAP(FPR, 32);
+ CHECK_VALUEMAP(FPR, 64);
+ CHECK_VALUEMAP(FPR, 128);
+ CHECK_VALUEMAP(FPR, 256);
+ CHECK_VALUEMAP(FPR, 512);
// Check the value mapping for 3-operands instructions where all the operands
// map to the same value mapping.
@@ -137,13 +137,13 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
} while (false)
- CHECK_VALUEMAP_3OPS(GPR, 32);
- CHECK_VALUEMAP_3OPS(GPR, 64);
- CHECK_VALUEMAP_3OPS(FPR, 32);
- CHECK_VALUEMAP_3OPS(FPR, 64);
- CHECK_VALUEMAP_3OPS(FPR, 128);
- CHECK_VALUEMAP_3OPS(FPR, 256);
- CHECK_VALUEMAP_3OPS(FPR, 512);
+ CHECK_VALUEMAP_3OPS(GPR, 32);
+ CHECK_VALUEMAP_3OPS(GPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 32);
+ CHECK_VALUEMAP_3OPS(FPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 128);
+ CHECK_VALUEMAP_3OPS(FPR, 256);
+ CHECK_VALUEMAP_3OPS(FPR, 512);
#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
do { \
@@ -165,14 +165,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
\
} while (false)
- CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \
do { \
@@ -193,12 +193,15 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
\
} while (false)
- CHECK_VALUEMAP_FPEXT(32, 16);
- CHECK_VALUEMAP_FPEXT(64, 16);
- CHECK_VALUEMAP_FPEXT(64, 32);
- CHECK_VALUEMAP_FPEXT(128, 64);
+ CHECK_VALUEMAP_FPEXT(32, 16);
+ CHECK_VALUEMAP_FPEXT(64, 16);
+ CHECK_VALUEMAP_FPEXT(64, 32);
+ CHECK_VALUEMAP_FPEXT(128, 64);
- assert(verify(TRI) && "Invalid register bank information");
+ assert(verify(TRI) && "Invalid register bank information");
+ };
+
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
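
The once_flag rewrite makes the one-time initialization thread-safe; the old `static bool AlreadyInit` guard raced when two RegisterBankInfo objects were constructed concurrently. A minimal sketch of the idiom in plain C++ (llvm::call_once forwards to std::call_once on most hosts):

    #include <mutex>

    static std::once_flag InitFlag;

    void initRegisterBanks() {
      std::call_once(InitFlag, [] {
        // Runs exactly once; concurrent callers block until the first
        // invocation finishes, unlike a plain boolean guard.
      });
    }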
unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
@@ -228,8 +231,11 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
switch (RC.getID()) {
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
+ case AArch64::FPR16_loRegClassID:
+ case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
+ case AArch64::FPR64_loRegClassID:
case AArch64::FPR128RegClassID:
case AArch64::FPR128_loRegClassID:
case AArch64::DDRegClassID:
@@ -495,6 +501,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(
const MachineInstr &MI, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
switch (MI.getOpcode()) {
+ case AArch64::G_DUP:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -636,6 +643,16 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Some of the floating-point instructions have mixed GPR and FPR operands:
// fine-tune the computed mapping.
switch (Opc) {
+ case AArch64::G_DUP: {
+ Register ScalarReg = MI.getOperand(1).getReg();
+ auto ScalarDef = MRI.getVRegDef(ScalarReg);
+ if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*ScalarDef, MRI, TRI))
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ else
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+ break;
+ }
case TargetOpcode::G_TRUNC: {
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
@@ -680,7 +697,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// In that case, we want the default mapping to be on FPR
// instead of blind map every scalar to GPR.
for (const MachineInstr &UseMI :
- MRI.use_instructions(MI.getOperand(0).getReg())) {
+ MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
// If we have at least one direct use in a FP instruction,
// assume this was a floating point load in the IR.
// If it was not, we would have had a bitcast before
@@ -727,9 +744,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// %z = G_SELECT %cond %x %y
// fpr = G_FOO %z ...
- if (any_of(
- MRI.use_instructions(MI.getOperand(0).getReg()),
- [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); }))
+ if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); }))
++NumFP;
// Check if the defs of the source values always produce floating point
@@ -770,7 +786,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// UNMERGE into scalars from a vector should always use FPR.
// Likewise if any of the uses are FP instructions.
if (SrcTy.isVector() || SrcTy == LLT::scalar(128) ||
- any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
+ any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
[&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) {
// Set the register bank of every operand to FPR.
for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index e956fca1aa109..e956fca1aa109 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 05a909f1780a0..9814f76258538 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -763,10 +763,10 @@ static inline bool isSVECpyImm(int64_t Imm) {
bool IsImm8 = int8_t(Imm) == Imm;
bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
- if (std::is_same<int8_t, typename std::make_signed<T>::type>::value)
+ if (std::is_same<int8_t, std::make_signed_t<T>>::value)
return IsImm8 || uint8_t(Imm) == Imm;
- if (std::is_same<int16_t, typename std::make_signed<T>::type>::value)
+ if (std::is_same<int16_t, std::make_signed_t<T>>::value)
return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm;
return IsImm8 || IsImm16;
@@ -775,8 +775,7 @@ static inline bool isSVECpyImm(int64_t Imm) {
/// Returns true if Imm is valid for ADD/SUB.
template <typename T>
static inline bool isSVEAddSubImm(int64_t Imm) {
- bool IsInt8t =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value;
return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
}
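
Both helpers accept an 8-bit immediate, optionally left-shifted by 8 for element types wider than a byte. A self-contained restatement of the same check with two worked values (names here are illustrative, not LLVM's):

    #include <cstdint>
    #include <type_traits>

    template <typename T> bool isAddSubImm(int64_t Imm) {
      bool IsInt8 = std::is_same<int8_t, std::make_signed_t<T>>::value;
      return uint8_t(Imm) == Imm ||                     // 0..255
             (!IsInt8 && uint16_t(Imm & ~0xff) == Imm); // low byte zero, <= 0xff00
    }
    // isAddSubImm<int16_t>(0x1200) -> true  (0x12, LSL #8)
    // isAddSubImm<int16_t>(0x1234) -> false (not imm8 and not imm8 << 8)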
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 9db746733aa35..9f7dfdf624829 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -33,6 +34,7 @@ namespace {
class AArch64AsmBackend : public MCAsmBackend {
static const unsigned PCRelFlagVal =
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+protected:
Triple TheTriple;
public:
@@ -68,6 +70,11 @@ public:
{"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
{"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
+  // Fixup kinds from the .reloc directive are like R_AARCH64_NONE. They do not
+ // require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -86,8 +93,8 @@ public:
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
@@ -108,7 +115,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
case AArch64::fixup_aarch64_tlsdesc_call:
return 0;
@@ -237,11 +243,22 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS &&
AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) {
- // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't
- // ever be resolved in the assembler.
- Ctx.reportError(Fixup.getLoc(),
- "relocation for a thread-local variable points to an "
- "absolute symbol");
+ if (!RefKind) {
+      // The fixup is a bare expression with no relocation modifier.
+ if (SignedValue > 0xFFFF || SignedValue < -0xFFFF)
+ Ctx.reportError(Fixup.getLoc(),
+ "fixup value out of range [-0xFFFF, 0xFFFF]");
+
+ // Invert the negative immediate because it will feed into a MOVN.
+ if (SignedValue < 0)
+ SignedValue = ~SignedValue;
+ Value = static_cast<uint64_t>(SignedValue);
+ } else
+ // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't
+ // ever be resolved in the assembler.
+ Ctx.reportError(Fixup.getLoc(),
+ "relocation for a thread-local variable points to an "
+ "absolute symbol");
return Value;
}
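
The inversion exists because a bare movw expression that ends up negative is materialized with MOVN, and MOVN writes the bitwise NOT of its immediate. A worked instance:

    // Target value -3: since MOVN Xd, #imm yields Xd = ~imm,
    // encode imm = ~(-3) = 2, keeping the 16-bit field non-negative.
    int64_t SignedValue = -3;
    uint64_t Value = static_cast<uint64_t>(~SignedValue); // 2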
@@ -329,7 +346,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value))
Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!");
LLVM_FALLTHROUGH;
- case FK_NONE:
case FK_SecRel_2:
case FK_SecRel_4:
return Value;
@@ -337,9 +353,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
}
Optional<MCFixupKind> AArch64AsmBackend::getFixupKind(StringRef Name) const {
- if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE")
- return FK_NONE;
- return MCAsmBackend::getFixupKind(Name);
+ if (!TheTriple.isOSBinFormatELF())
+ return None;
+
+ unsigned Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
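
After this change `.reloc` can name any AArch64 relocation, not just R_AARCH64_NONE: every ELF_RELOC entry becomes a StringSwitch case mapping the name to its ELF r_type, offset past FirstLiteralRelocationKind. A trimmed sketch of the expansion (two representative cases; the values are the r_type numbers from the AArch64 ELF ABI):

    #include "llvm/ADT/StringSwitch.h"

    unsigned relocTypeByName(llvm::StringRef Name) {
      return llvm::StringSwitch<unsigned>(Name)
          .Case("R_AARCH64_NONE", 0)
          .Case("R_AARCH64_ABS64", 257)
          // ...one .Case per ELF_RELOC entry in ELFRelocs/AArch64.def
          .Default(-1u);
    }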
/// getFixupKindContainereSizeInBytes - The number of bytes of the
@@ -386,9 +410,12 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
MCContext &Ctx = Asm.getContext();
int64_t SignedValue = static_cast<int64_t>(Value);
@@ -424,8 +451,9 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// FIXME: getFixupKindInfo() and getFixupKindNumBytes() could be fixed to
// handle this more cleanly. This may affect the output of -show-mc-encoding.
AArch64MCExpr::VariantKind RefKind =
- static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
- if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
+ static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+ if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS ||
+ (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) {
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
@@ -451,9 +479,8 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
return int64_t(Value) != int64_t(int8_t(Value));
}
-void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void AArch64AsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
@@ -474,7 +501,7 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target) {
unsigned Kind = Fixup.getKind();
- if (Kind == FK_NONE)
+ if (Kind >= FirstLiteralRelocationKind)
return true;
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
@@ -544,7 +571,6 @@ enum CompactUnwindEncodings {
// FIXME: This should be in a separate file.
class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
- bool IsILP32;
/// Encode compact unwind stack adjustment for frameless functions.
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
@@ -555,18 +581,15 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
public:
DarwinAArch64AsmBackend(const Target &T, const Triple &TT,
- const MCRegisterInfo &MRI, bool IsILP32)
- : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI),
- IsILP32(IsILP32) {}
+ const MCRegisterInfo &MRI)
+ : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- if (IsILP32)
- return createAArch64MachObjectWriter(
- MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true);
- else
- return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
- MachO::CPU_SUBTYPE_ARM64_ALL, false);
+ uint32_t CPUType = cantFail(MachO::getCPUType(TheTriple));
+ uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TheTriple));
+ return createAArch64MachObjectWriter(CPUType, CPUSubType,
+ TheTriple.isArch32Bit());
}
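
MachO::getCPUType and MachO::getCPUSubType return Expected<uint32_t>, so the cantFail calls encode the assumption that any triple reaching the Darwin backend maps to a known Mach-O CPU pair. A hypothetical probe:

    #include "llvm/ADT/Triple.h"
    #include "llvm/BinaryFormat/MachO.h"
    #include "llvm/Support/Error.h"

    uint32_t darwinCPUType() {
      llvm::Triple T("arm64_32-apple-watchos");
      return llvm::cantFail(llvm::MachO::getCPUType(T)); // CPU_TYPE_ARM64_32
    }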
/// Generate the compact unwind encoding from the CFI directives.
@@ -749,8 +772,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO()) {
- const bool IsILP32 = TheTriple.isArch32Bit();
- return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32);
+ return new DarwinAArch64AsmBackend(T, TheTriple, MRI);
}
if (TheTriple.isOSBinFormatCOFF())
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 0fd1ca187be7f..e5637dcab9419 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -106,13 +106,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
+ unsigned Kind = Fixup.getTargetKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
assert((!Target.getSymA() ||
- Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None ||
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_PLT) &&
"Should only be expression-level modifiers here");
assert((!Target.getSymB() ||
@@ -120,14 +124,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
"Should only be expression-level modifiers here");
if (IsPCRel) {
- switch (Fixup.getTargetKind()) {
+ switch (Kind) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
case FK_Data_2:
return R_CLS(PREL16);
- case FK_Data_4:
- return R_CLS(PREL32);
+ case FK_Data_4: {
+ return Target.getAccessVariant() == MCSymbolRefExpr::VK_PLT
+ ? R_CLS(PLT32)
+ : R_CLS(PREL32);
+ }
case FK_Data_8:
if (IsILP32) {
Ctx.reportError(Fixup.getLoc(),
@@ -185,8 +192,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
return ELF::R_AARCH64_NONE;
switch (Fixup.getTargetKind()) {
- case FK_NONE:
- return ELF::R_AARCH64_NONE;
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index c33f7e957b54a..fe4c34be1519b 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -81,14 +81,14 @@ public:
std::move(Emitter)),
MappingSymbolCounter(0), LastEMS(EMS_None) {}
- void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, const MCExpr *Subsection) override {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
LastMappingSymbols[getPreviousSection().first] = LastEMS;
LastEMS = LastMappingSymbols.lookup(Section);
- MCELFStreamer::ChangeSection(Section, Subsection);
+ MCELFStreamer::changeSection(Section, Subsection);
}
// Reset state between object emissions
@@ -102,10 +102,10 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
+ void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
EmitA64MappingSymbol();
- MCELFStreamer::EmitInstruction(Inst, STI);
+ MCELFStreamer::emitInstruction(Inst, STI);
}
/// Emit a 32-bit value as an instruction. This is only used for the .inst
@@ -122,28 +122,28 @@ public:
}
EmitA64MappingSymbol();
- MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+ MCELFStreamer::emitBytes(StringRef(Buffer, 4));
}
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitBytes(StringRef Data) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitBytes(Data);
+ void emitBytes(StringRef Data) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitBytes(Data);
}
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitValueImpl(Value, Size, Loc);
}
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc) override {
- EmitDataMappingSymbol();
+ emitDataMappingSymbol();
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
private:
@@ -153,7 +153,7 @@ private:
EMS_Data
};
- void EmitDataMappingSymbol() {
+ void emitDataMappingSymbol() {
if (LastEMS == EMS_Data)
return;
EmitMappingSymbol("$d");
@@ -170,7 +170,7 @@ private:
void EmitMappingSymbol(StringRef Name) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabel(Symbol);
+ emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 469892213ef87..38474d31460dd 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -283,7 +283,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
if (Opcode == AArch64::SPACE) {
- O << '\t' << MAI.getCommentString() << " SPACE";
+ O << '\t' << MAI.getCommentString() << " SPACE "
+ << MI->getOperand(1).getImm();
printAnnotation(O, Annot);
return;
}
@@ -295,7 +296,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
return;
}
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
@@ -900,6 +901,19 @@ void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
O << format("#%#llx", Op.getImm());
}
+template<int Size>
+void AArch64InstPrinter::printSImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Size == 8)
+ O << "#" << formatImm((signed char)Op.getImm());
+ else if (Size == 16)
+ O << "#" << formatImm((signed short)Op.getImm());
+ else
+ O << "#" << formatImm(Op.getImm());
+}
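
The Size parameter controls how much of the raw operand is reinterpreted as signed before printing, so a byte-sized operand of 0xFF comes out as #-1. A standalone illustration:

    #include <cstdio>

    int main() {
      long long Imm = 0xFF;                    // raw operand payload
      std::printf("#%d\n", (signed char)Imm);  // Size == 8  -> #-1
      std::printf("#%lld\n", Imm);             // no narrowing -> #255
    }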
+
void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
unsigned Imm, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -1334,7 +1348,8 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
O << "[" << MI->getOperand(OpNum).getImm() << "]";
}
-void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
+ unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNum);
@@ -1342,17 +1357,20 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << formatImm(Op.getImm() * 4);
+ int64_t Offset = Op.getImm() * 4;
+ if (PrintBranchImmAsAddress)
+ O << formatHex(Address + Offset);
+ else
+ O << "#" << formatImm(Offset);
return;
}
// If the branch target is simply an address then print it in hex.
const MCConstantExpr *BranchTarget =
dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ int64_t TargetAddress;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(TargetAddress)) {
+ O << formatHex(TargetAddress);
} else {
// Otherwise, just print the expression.
MI->getOperand(OpNum).getExpr()->print(O, &MAI);
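
When PrintBranchImmAsAddress is set, the printer resolves the scaled offset against the instruction's own address instead of showing the raw displacement. A worked instance (addresses assumed):

    // Instruction at Address = 0x400080, Op.getImm() = 4:
    //   Offset = 4 * 4 = 16
    //   PrintBranchImmAsAddress -> prints 0x400090  (Address + Offset)
    //   otherwise               -> prints #16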
@@ -1411,6 +1429,12 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
+ // Horrible hack for two different registers having the same encoding.
+ if (Val == AArch64SysReg::TRCEXTINSELR) {
+ O << "TRCEXTINSELR";
+ return;
+ }
+
const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
O << Reg->Name;
@@ -1431,6 +1455,12 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
+ // Horrible hack for two different registers having the same encoding.
+ if (Val == AArch64SysReg::TRCEXTINSELR) {
+ O << "TRCEXTINSELR";
+ return;
+ }
+
const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
O << Reg->Name;
@@ -1499,7 +1529,7 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
template <typename T>
void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) {
- typename std::make_unsigned<T>::type HexValue = Value;
+ std::make_unsigned_t<T> HexValue = Value;
if (getPrintImmHex())
O << '#' << formatHex((uint64_t)HexValue);
@@ -1544,8 +1574,8 @@ template <typename T>
void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- typedef typename std::make_signed<T>::type SignedT;
- typedef typename std::make_unsigned<T>::type UnsignedT;
+ typedef std::make_signed_t<T> SignedT;
+ typedef std::make_unsigned_t<T> UnsignedT;
uint64_t Val = MI->getOperand(OpNum).getImm();
UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 993f379b53433..6da5f0e81c803 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -32,10 +32,10 @@ public:
// Autogenerated by tblgen.
virtual void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -56,6 +56,9 @@ protected:
raw_ostream &O);
void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <int Size>
+ void printSImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
template <typename T> void printImmSVE(T Value, raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
@@ -97,7 +100,7 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printInverseCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printAlignedLabel(const MCInst *MI, unsigned OpNum,
+ void printAlignedLabel(const MCInst *MI, uint64_t Address, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
raw_ostream &O);
@@ -202,10 +205,10 @@ public:
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O) override;
- bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O) override;
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O) override;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 5926a4f81616c..9a63e26dec190 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -60,7 +60,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context);
MCSymbol *PCSym = Context.createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context);
return MCBinaryExpr::createSub(Res, PC, Context);
}
@@ -96,8 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
- UseIntegratedAssembler = true;
-
HasIdentDirective = true;
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 8f4d9cb94d607..da8f511c650f0 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -569,23 +569,24 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
if (UImm16MO.isImm())
return EncodedValue;
- const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
- switch (A64E->getKind()) {
- case AArch64MCExpr::VK_DTPREL_G2:
- case AArch64MCExpr::VK_DTPREL_G1:
- case AArch64MCExpr::VK_DTPREL_G0:
- case AArch64MCExpr::VK_GOTTPREL_G1:
- case AArch64MCExpr::VK_TPREL_G2:
- case AArch64MCExpr::VK_TPREL_G1:
- case AArch64MCExpr::VK_TPREL_G0:
- return EncodedValue & ~(1u << 30);
- default:
- // Nothing to do for an unsigned fixup.
- return EncodedValue;
+ const MCExpr *E = UImm16MO.getExpr();
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
+ switch (A64E->getKind()) {
+ case AArch64MCExpr::VK_DTPREL_G2:
+ case AArch64MCExpr::VK_DTPREL_G1:
+ case AArch64MCExpr::VK_DTPREL_G0:
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G2:
+ case AArch64MCExpr::VK_TPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G0:
+ return EncodedValue & ~(1u << 30);
+ default:
+ // Nothing to do for an unsigned fixup.
+ return EncodedValue;
+ }
}
-
- return EncodedValue & ~(1u << 30);
+ return EncodedValue;
}
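
fixMOVZ can flip a MOVZ into a MOVN because the two share an encoding apart from the opc field; after this change the flip applies only to the signed TLS modifiers, and a bare expression keeps its MOVZ encoding. The bit manipulation, spelled out:

    #include <cstdint>

    // Bits 30:29 hold the opc field of the MOV-wide-immediate class:
    // MOVN = 0b00, MOVZ = 0b10, MOVK = 0b11.
    uint32_t movzToMovn(uint32_t EncodedValue) {
      return EncodedValue & ~(1u << 30); // clear bit 30: MOVZ -> MOVN
    }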
void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 7dc3665baabc5..209bff3a23117 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -254,7 +254,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
// Initial state of the frame pointer is SP.
unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0);
MAI->addInitialFrameState(Inst);
return MAI;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index fc04d37eb3623..b0f414bd27edd 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -139,7 +139,7 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
return false;
if (RefSec.getSegmentName() == "__DATA" &&
- RefSec.getSectionName() == "__objc_classrefs")
+ RefSec.getName() == "__objc_classrefs")
return false;
// FIXME: ld64 currently handles internal pointer-sized relocations
@@ -407,5 +407,5 @@ std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
bool IsILP32) {
return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
- IsILP32);
+ IsILP32);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index f70752f5303f3..48ed68f492635 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -51,7 +51,7 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) {
Inst >>= 8;
}
- getStreamer().EmitBytes(StringRef(Buffer, 4));
+ getStreamer().emitBytes(StringRef(Buffer, 4));
}
namespace llvm {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 37c6fbb039081..03fbab5142a2e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -28,7 +28,7 @@ public:
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
- void FinishImpl() override;
+ void finishImpl() override;
};
void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
@@ -45,11 +45,11 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
EHStreamer.Emit(*this);
}
-void AArch64WinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void AArch64WinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
EmitWindowsUnwindTables();
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
} // end anonymous namespace
@@ -68,7 +68,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode,
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
return;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset);
if (InEpilogCFI)
CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
@@ -158,7 +158,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
if (!CurFrame)
return;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
CurFrame->PrologEnd = Label;
WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
auto it = CurFrame->Instructions.begin();
@@ -172,7 +172,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() {
return;
InEpilogCFI = true;
- CurrentEpilog = S.EmitCFILabel();
+ CurrentEpilog = S.emitCFILabel();
}
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
@@ -182,7 +182,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
return;
InEpilogCFI = false;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
CurrentEpilog = nullptr;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a172b8d7e6b0a..a005d1e65abe1 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10,6 +10,14 @@
//
//===----------------------------------------------------------------------===//
+def SDT_AArch64Setcc : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
+ SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>,
+ SDTCisVT<4, OtherVT>
+]>;
+
+def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>;
+
def SVEPatternOperand : AsmOperandClass {
let Name = "SVEPattern";
let ParserMethod = "tryParseSVEPattern";
@@ -33,7 +41,7 @@ def SVEPrefetchOperand : AsmOperandClass {
let RenderMethod = "addPrefetchOperands";
}
-def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
+def sve_prfop : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) <= 15);
}]> {
let PrintMethod = "printPrefetchOp<true>";
@@ -167,8 +175,8 @@ def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<i
def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
class imm8_opt_lsl<int ElementWidth, string printType,
- AsmOperandClass OpndClass, code Predicate>
- : Operand<i32>, ImmLeaf<i32, Predicate> {
+ AsmOperandClass OpndClass>
+ : Operand<i32> {
let EncoderMethod = "getImm8OptLsl";
let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
let PrintMethod = "printImm8OptLsl<" # printType # ">";
@@ -176,31 +184,15 @@ class imm8_opt_lsl<int ElementWidth, string printType,
let MIOperandInfo = (ops i32imm, i32imm);
}
-def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
- return AArch64_AM::isSVECpyImm<int8_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
- return AArch64_AM::isSVECpyImm<int16_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
- return AArch64_AM::isSVECpyImm<int32_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
- return AArch64_AM::isSVECpyImm<int64_t>(Imm);
-}]>;
-
-def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
- return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
- return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
- return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
- return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
-}]>;
+def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>;
+def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>;
+def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>;
+def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>;
+
+def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>;
+def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>;
+def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
+def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
@@ -212,9 +204,13 @@ def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>",
def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>;
def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
+def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
+
def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>;
def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>;
+def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
+
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
@@ -324,6 +320,16 @@ class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
+class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+ ZPRRegOp zprty, Operand ImmTy, Instruction inst>
+ : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))),
+ (inst $Op1, ImmTy:$imm)>;
+
+class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+ ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
+ : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+ (inst $Op1, i32:$imm)>;
+
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
@@ -367,8 +373,22 @@ class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
(inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
+def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>;
def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>;
+let AddedComplexity = 1 in {
+class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
+ (inst $Op1, $Op2, $Op3)>;
+
+class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
+ ValueType vt1, ValueType vt2,
+ Operand vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))),
+ (inst $Op1, $Op2, vt3:$Op3)>;
+}
+
//
// Common but less generic patterns.
//
@@ -378,6 +398,69 @@ class SVE_1_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1)),
(inst (IMPLICIT_DEF), (ptrue 31), $Op1)>;
+class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, Instruction inst, Instruction ptrue>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
+ (inst (ptrue 31), $Op1, $Op2)>;
+
+//
+// Pseudo -> Instruction mappings
+//
+def getSVEPseudoMap : InstrMapping {
+ let FilterClass = "SVEPseudo2Instr";
+ let RowFields = ["PseudoName"];
+ let ColFields = ["IsInstr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+class SVEPseudo2Instr<string name, bit instr> {
+ string PseudoName = name;
+ bit IsInstr = instr;
+}
+
+// Lookup e.g. DIV -> DIVR
+def getSVERevInstr : InstrMapping {
+ let FilterClass = "SVEInstr2Rev";
+ let RowFields = ["InstrName"];
+ let ColFields = ["isReverseInstr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+// Lookup e.g. DIVR -> DIV
+def getSVENonRevInstr : InstrMapping {
+ let FilterClass = "SVEInstr2Rev";
+ let RowFields = ["InstrName"];
+ let ColFields = ["isReverseInstr"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
+class SVEInstr2Rev<string name1, string name2, bit name1IsReverseInstr> {
+ string InstrName = !if(name1IsReverseInstr, name1, name2);
+ bit isReverseInstr = name1IsReverseInstr;
+}
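
For each InstrMapping, TableGen emits a lookup function named after the def (the conventional shape is `int <defname>(uint16_t Opcode)`, returning -1 when there is no entry). A hedged usage sketch; the driver function is illustrative and assumes the generated AArch64 mapping tables are in scope:

    #include <cstdint>

    void demo(uint16_t PseudoOpc, uint16_t InstrOpc) {
      int Real = AArch64::getSVEPseudoMap(PseudoOpc); // pseudo -> instruction
      int Rev  = AArch64::getSVERevInstr(InstrOpc);   // e.g. DIV  -> DIVR
      (void)Real; (void)Rev;                          // -1 means "no mapping"
    }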
+
+//
+// Pseudos for destructive operands
+//
+let hasNoSchedulingInfo = 1 in {
+ class PredTwoOpPseudo<string name, ZPRRegOp zprty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> {
+ let FalseLanes = flags;
+ }
+
+ class PredTwoOpImmPseudo<string name, ZPRRegOp zprty, Operand immty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
+ let FalseLanes = flags;
+ }
+}
+
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
//===----------------------------------------------------------------------===//
@@ -566,7 +649,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -680,7 +763,7 @@ class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -941,11 +1024,46 @@ multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_perm_tbl<string asm> {
+multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
+
+ def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
+ (nxv16i8 (!cast<Instruction>(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+ nxv16i8:$Op2, zsub1),
+ nxv16i8:$Op3))>;
+
+ def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)),
+ (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ nxv8i16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)),
+ (nxv4i32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0,
+ nxv4i32:$Op2, zsub1),
+ nxv4i32:$Op3))>;
+
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)),
+ (nxv2i64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0,
+ nxv2i64:$Op2, zsub1),
+ nxv2i64:$Op3))>;
+
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)),
+ (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+ nxv8f16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)),
+ (nxv4f32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0,
+ nxv4f32:$Op2, zsub1),
+ nxv4i32:$Op3))>;
+
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)),
+ (nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0,
+ nxv2f64:$Op2, zsub1),
+ nxv2i64:$Op3))>;
}
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -967,11 +1085,20 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_perm_tbx<string asm> {
+multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -1072,7 +1199,7 @@ class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> {
@@ -1102,7 +1229,7 @@ class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
@@ -1135,7 +1262,7 @@ class sve_int_perm_extract_i<string asm>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1244,13 +1371,22 @@ class sve_int_pred_log<bits<4> opc, string asm>
}
-multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_nopred = null_frag> {
def NAME : sve_int_pred_log<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1,
+ !cast<Instruction>(NAME), PTRUE_B>;
+ def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1,
+ !cast<Instruction>(NAME), PTRUE_H>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4i1, nxv4i1,
+ !cast<Instruction>(NAME), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1,
+ !cast<Instruction>(NAME), PTRUE_D>;
}
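The op_nopred patterns let an unpredicated IR-level operation reuse the predicated predicate-logic instruction by synthesizing an all-active governing predicate. Assuming SVE_2_Op_AllActive_Pat follows the convention of the other helpers in this file (31 being the ALL pattern for PTRUE), its shape is roughly:

class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                             ValueType vt2, Instruction inst, Instruction ptrue>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
      (inst (ptrue 31), $Op1, $Op2)>;

The null_frag default keeps existing callers that have no unpredicated form from growing dead patterns.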
@@ -1272,7 +1408,7 @@ class sve_int_log_imm<bits<2> opc, string asm>
let Constraints = "$Zdn = $_Zdn";
let DecoderMethod = "DecodeSVELogicalImmInstruction";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1357,7 +1493,8 @@ class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm,
+ SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
@@ -1367,6 +1504,12 @@ multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ // Intrinsic version
+ def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
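With the extra int_op parameter, a single defm now wires up both the generic ISD node and the unpredicated ACLE intrinsic. A hypothetical instantiation (opcode and intrinsic name illustrative, not taken from this patch):

// Matches both generic (saddsat ...) IR and the intrinsic form; a caller
// with no intrinsic counterpart would pass null_frag instead.
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat,
                                         int_aarch64_sve_sqadd_x>;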
//===----------------------------------------------------------------------===//
@@ -1394,7 +1537,7 @@ class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1423,15 +1566,21 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_2op_p_zds<bits<4> opc, string asm,
- SDPatternOperator op> {
- def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
+ SDPatternOperator op, DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
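The new Ps parameter and the SVEPseudo2Instr/SVEInstr2Rev mixins are the hooks for MOVPRFX-aware expansion: each real instruction registers itself under the same name as the ISel pseudos defined elsewhere, and the reverse-instruction link lets the expander commute to a _REV form when that saves a copy. A sketch of the bookkeeping class, with field names assumed:

class SVEPseudo2Instr<string name, bit instr> {
  string PseudoName = name;   // shared key between pseudo and instruction
  bit IsInstr = instr;        // 1 on the real instruction, 0 on the pseudo
}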
@@ -1449,6 +1598,16 @@ multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
+ def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
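The _ZERO pseudos cover the zeroing-predication forms, which have no direct encoding for these destructive FP operations; they are expanded later into a zeroing MOVPRFX plus the real instruction. SVE_3_Op_Pat_SelZero presumably matches the operation with its inactive lanes already selected to zero, along these lines (a sketch, not the exact in-tree definition):

class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
                           ValueType vt2, ValueType vt3, Pseudo inst>
: Pat<(vtd (op vt1:$Pg, (vselect vt1:$Pg, vt2:$Zs1, (SVEDup0)), vt3:$Zs2)),
      (inst $Pg, $Zs1, $Zs2)>;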
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
@@ -1466,7 +1625,7 @@ class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1551,7 +1710,7 @@ class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1586,7 +1745,7 @@ class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1620,7 +1779,7 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1646,12 +1805,12 @@ multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm,
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b:$idx))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b:$idx)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
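The move from VectorIndexH32b and friends to their _timm twins is needed because an immediate arriving through an intrinsic call reaches ISel as a TargetConstant, which an ImmLeaf-based operand will never match. AArch64 defines the two flavors as paired leaves over a single operand class; assuming the in-tree VectorIndex multiclass, the shape is:

multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
  def ""    : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>;  // plain Constant
  def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>; // TargetConstant
}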
@@ -1694,12 +1853,12 @@ multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b:$idx))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b:$idx)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -1727,7 +1886,7 @@ class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1767,7 +1926,7 @@ class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1785,10 +1944,10 @@ multiclass sve_fp_fcmla_by_indexed_elem<string asm, SDPatternOperator op> {
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b:$idx), (i32 complexrotateop:$imm))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b:$idx, complexrotateop:$imm)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b:$idx), (i32 complexrotateop:$imm))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -1815,7 +1974,7 @@ class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1861,22 +2020,22 @@ multiclass sve2_fp_convert_down_narrow<string asm, string op> {
def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
- def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
multiclass sve2_fp_convert_up_long<string asm, string op> {
def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv16i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
- def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
+ def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
}
multiclass sve2_fp_convert_down_odd_rounding_top<string asm, string op> {
def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
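This predicate-type change is a correctness fix, not a rename: the narrowing and widening conversions are governed per element of the wider operand, so the pattern must use nxv4i1/nxv2i1 rather than the byte-granular nxv16i1, or the dag will never match what the intrinsic lowering produces. After the fix, a _DtoS pattern expands to, schematically (instruction and intrinsic names illustrative):

def : Pat<(nxv4f32 (int_aarch64_sve_fcvtnt_f32f64 nxv4f32:$Zd, nxv2i1:$Pg, nxv2f64:$Zn)),
          (FCVTNT_ZPmZ_DtoS $Zd, $Pg, $Zn)>;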
//===----------------------------------------------------------------------===//
@@ -1902,7 +2061,7 @@ class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1942,14 +2101,14 @@ class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm,
SDPatternOperator op> {
def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>;
- def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b, !cast<Instruction>(NAME)>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
@@ -1974,7 +2133,7 @@ class sve2_fp_mla_long<bits<2> opc, string asm>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -2084,7 +2243,7 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = size;
}
@@ -2120,7 +2279,7 @@ multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
//===----------------------------------------------------------------------===//
@@ -2176,7 +2335,7 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2192,11 +2351,20 @@ multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
- def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
- def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
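A hypothetical caller of the reworked multiclass (names illustrative): the Ps string must match the prefix under which the corresponding pseudos are defined, and the flags value tells the expand pass which kind of destructive operation this is:

defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPmZ",
                                        int_aarch64_sve_add,
                                        DestructiveBinaryComm>;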
@@ -2229,9 +2397,16 @@ multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op
}
// Special case for divides, which are not defined for 8b/16b elements.
-multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, SDPatternOperator op> {
- def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
@@ -2262,7 +2437,7 @@ class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2299,7 +2474,7 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2336,21 +2511,30 @@ class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_mla<bit S, string asm> {
+multiclass sve2_int_mla<bit S, string asm, SDPatternOperator op> {
def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mla_long<bits<5> opc, string asm> {
+multiclass sve2_int_mla_long<bits<5> opc, string asm, SDPatternOperator op> {
def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2372,39 +2556,44 @@ class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> {
- def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
- def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add Long - Indexed Group
//===----------------------------------------------------------------------===//
-multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
+multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
- asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2412,13 +2601,16 @@ multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
- asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2442,7 +2634,7 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
@@ -2474,28 +2666,28 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
- def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
- def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -2521,24 +2713,36 @@ class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_cintx_dot<string asm> {
+multiclass sve2_cintx_dot<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
+ (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>;
+ def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
-multiclass sve2_int_cmla<bit opc, string asm> {
+multiclass sve2_int_cmla<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
+
+ def : SVE_4_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, i32, complexrotateop, !cast<Instruction>(NAME # _B)>;
+ def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, complexrotateop, !cast<Instruction>(NAME # _H)>;
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, complexrotateop, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, complexrotateop, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2563,42 +2767,58 @@ class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_cintx_dot_by_indexed_elem<string asm> {
- def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+multiclass sve2_cintx_dot_by_indexed_elem<string asm, SDPatternOperator op> {
+ def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
+ (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
-multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> {
- def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> {
+multiclass sve2_cmla_by_indexed_elem<bit opc, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> {
+ def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3),
+ (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -2621,11 +2841,22 @@ class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve2_int_mul<bits<3> opc, string asm> {
+multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> {
+ def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
}
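The _single variant serves operations the architecture only defines at byte granularity; in-tree it is presumably instantiated for PMUL, along these lines (illustrative):

defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;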
//===----------------------------------------------------------------------===//
@@ -2648,31 +2879,37 @@ class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> {
- def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
- def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
+multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
- ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2680,13 +2917,16 @@ multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
- ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2702,7 +2942,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
bits<5> Zdn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
- let Inst{21} = 0b0;
+ let Inst{21-20} = 0b01;
let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10;
let Inst{13} = opc{0};
@@ -2711,15 +2951,20 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_int_arith_pred<bits<6> opc, string asm> {
+multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
@@ -2739,14 +2984,18 @@ class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty1.ElementSize;
}
-multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> {
+multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
@@ -2770,19 +3019,26 @@ class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> {
+multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
}
-multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> {
+multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2806,21 +3062,47 @@ class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> {
+multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
+multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve2_wide_int_arith_pmul<bits<2> sz, bits<5> opc, string asm,
+ SDPatternOperator op> {
+ def NAME : sve2_wide_int_arith<sz, opc, asm, ZPR128, ZPR64, ZPR64>;
+
+  // To avoid using 128-bit elements in the IR, the pattern below works with
+  // LLVM intrinsics carrying a _pair suffix, reflecting that
+  // _Q is implemented as a pair of _D.
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
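Concretely, this is what lets the .Q forms of pmullb/pmullt be exposed without adding an i128 vector element type to the IR: the _pair intrinsics take and return nxv2i64, with each even/odd i64 pair of the result read as one 128-bit carry-less product. An illustrative instantiation (assuming the ACLE pair intrinsic names):

defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb",
                                             int_aarch64_sve_pmullb_pair>;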
-multiclass sve2_pmul_long<bits<1> opc, string asm> {
+multiclass sve2_pmul_long<bits<1> opc, string asm, SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
+
+  // To avoid using 128-bit elements in the IR, the patterns below work with
+  // LLVM intrinsics carrying a _pair suffix, reflecting that
+  // _H is implemented as a pair of _B, and _D as a pair of _S.
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2844,17 +3126,27 @@ class sve2_misc<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
+multiclass sve2_misc_bitwise<bits<4> opc, string asm, SDPatternOperator op> {
def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
+multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
@@ -2874,15 +3166,21 @@ class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
@@ -2905,7 +3203,8 @@ class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
+multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
@@ -2916,6 +3215,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
+ def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
}
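The tvecshiftL* operands used in the new patterns are assumed here to be the TImmLeaf counterparts of the vecshiftL* operands in the instruction definitions, mirroring the VectorIndex*_timm split above. A sketch of one of them:

// Assumed shape: same range check and parser class as vecshiftL8, but
// matching a TargetConstant so intrinsic immediates are accepted.
def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{
  return (((uint32_t)Imm) < 8);
}]> {
  let ParserMatchClass = Imm0_7Operand;
}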
//===----------------------------------------------------------------------===//
@@ -2943,7 +3245,8 @@ class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
@@ -2955,9 +3258,15 @@ multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -2969,6 +3278,11 @@ multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
@@ -2990,11 +3304,12 @@ class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -3006,6 +3321,11 @@ multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
@@ -3024,15 +3344,20 @@ class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_cadd<bit opc, string asm> {
+multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, complexrotateopodd, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, complexrotateopodd, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, complexrotateopodd, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;
}
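complexrotateopodd narrows the rotation immediate to the two values CADD/SQCADD actually encode; a sketch of its shape (the in-tree definition also carries parser and printer hooks):

def complexrotateopodd : Operand<i32>,
    TImmLeaf<i32, [{ return Imm == 90 || Imm == 270; }]>;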
class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
@@ -3052,28 +3377,41 @@ class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_absdiff_accum<bit opc, string asm> {
+multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> {
+multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
+multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3300,7 +3638,7 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -3465,11 +3803,12 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_arith_imm0<bits<3> opc, string asm,
+ SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
@@ -3479,6 +3818,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
+
+ // Intrinsic version
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3509,7 +3854,7 @@ class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -3519,10 +3864,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -3531,10 +3876,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -3604,11 +3949,11 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
@@ -3617,6 +3962,11 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
@@ -3638,11 +3988,11 @@ class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_rotate_right_imm<string asm> {
+multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> {
def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -3654,6 +4004,10 @@ multiclass sve2_int_rotate_right_imm<string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3678,7 +4032,7 @@ class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -3713,26 +4067,34 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_dup_imm_pred_merge<string asm> {
- let Constraints = "$Zd = $_Zd" in {
- def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
- def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
- def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
- def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
- }
-
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+multiclass sve_int_dup_imm_pred_merge_inst<
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
+ ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ let Constraints = "$Zd = $_Zd" in
+ def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
+ (ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+ (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
+ def : Pat<(intty
+ (vselect predty:$Pg,
+ (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
+ intty:$Zd)),
+ (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
+}
+
+multiclass sve_int_dup_imm_pred_merge<string asm> {
+ defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
+ i32, cpy_imm8_opt_lsl_i8>;
+ defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
+ i32, cpy_imm8_opt_lsl_i16>;
+ defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
+ i32, cpy_imm8_opt_lsl_i32>;
+ defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
+ i64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
@@ -3742,20 +4104,35 @@ multiclass sve_int_dup_imm_pred_merge<string asm> {
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
}
-multiclass sve_int_dup_imm_pred_zero<string asm> {
- def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
- def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
- def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
- def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
-
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+multiclass sve_int_dup_imm_pred_zero_inst<
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
+ ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
+ (ins PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+ (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
+ def : Pat<(intty (zext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
+ def : Pat<(intty (sext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
+ def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
+ def : Pat<(intty
+ (vselect predty:$Pg,
+ (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
+ (intty (AArch64dup (scalarty 0))))),
+ (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
+}
+
+multiclass sve_int_dup_imm_pred_zero<string asm> {
+ defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
+ i32, cpy_imm8_opt_lsl_i8>;
+ defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
+ i32, cpy_imm8_opt_lsl_i16>;
+ defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
+ i32, cpy_imm8_opt_lsl_i32>;
+ defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
+ i64, cpy_imm8_opt_lsl_i64>;
}
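// The zext/sext/anyext patterns above fold a predicate-to-vector extension
// into the zeroing move: inactive lanes become 0 and active lanes take the
// immediate, 1 for zero/any-extension and -1 for sign-extension (i1 true
// sign-extends to all ones). A hand-worked illustration, not part of this
// patch:
//   %v = sext <vscale x 16 x i1> %p to <vscale x 16 x i8>
// can now select to
//   mov z0.b, p0/z, #-1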
//===----------------------------------------------------------------------===//
@@ -3787,17 +4164,24 @@ class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
let Defs = [NZCV];
}
-multiclass sve_int_cmp_0<bits<3> opc, string asm, SDPatternOperator op,
- CondCode cc> {
+multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
+ ValueType intvt, sve_int_cmp cmp> {
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)),
+ (cmp $Op1, $Op2, $Op3)>;
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
+ (cmp $Op1, $Op3, $Op2)>;
+}
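// The second pattern lets a single compare encoding also cover the inverse
// condition by swapping the vector operands: e.g. (illustrative, with
// cc = SETGT and invcc = SETLT) setcc_z(pg, a, b, SETLT) denotes a < b,
// i.e. b > a, so it selects the same CMPGT with $Op2 and $Op3 exchanged.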
+
+multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_cmp_0_wide<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3852,67 +4236,35 @@ class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
let ElementSize = pprty.ElementSize;
}
-multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc,
- SDPatternOperator op = null_frag,
- SDPatternOperator inv_op = null_frag> {
+multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
+ ValueType predvt, ValueType intvt,
+ Operand immtype, Instruction cmp> {
+ def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
+ (intvt ZPR:$Zs1),
+ (intvt (AArch64dup (immtype:$imm))),
+ cc)),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+ def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
+ (intvt (AArch64dup (immtype:$imm))),
+ (intvt ZPR:$Zs1),
+ commuted_cc)),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+}
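// Likewise for the immediate forms: when the splat immediate sits on the
// left-hand side, matching the commuted condition keeps the immediate in the
// instruction's immediate slot. Illustrative instance, assuming cc = SETGT
// and commuted_cc = SETLT: setcc_z(pg, dup(#7), z1, SETLT) denotes 7 < z1,
// i.e. z1 > 7, and still selects cmpgt p0.b, p1/z, z1.b, #7.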
+
+multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, CondCode commuted_cc> {
def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
- // IR version
- def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, simm5_64b:$imm)>;
-
- // Intrinsic version
- def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>;
-
- // Inverted intrinsic version
- def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))),
- (nxv16i8 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))),
- (nxv8i16 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))),
- (nxv4i32 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))),
- (nxv2i64 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, simm5_32b,
+ !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, simm5_32b,
+ !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, simm5_32b,
+ !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, simm5_64b,
+ !cast<Instruction>(NAME # _D)>;
}
@@ -3944,66 +4296,20 @@ class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
}
multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc,
- SDPatternOperator op = null_frag,
- SDPatternOperator inv_op = null_frag> {
+ CondCode commuted_cc> {
def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>;
- // IR version
- def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, imm0_127_64b:$imm)>;
-
- // Intrinsic version
- def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>;
-
- // Inverted intrinsic version
- def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 (AArch64dup (imm0_127:$imm))),
- (nxv16i8 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 (AArch64dup (imm0_127:$imm))),
- (nxv8i16 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 (AArch64dup (imm0_127:$imm))),
- (nxv4i32 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))),
- (nxv2i64 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, imm0_127,
+ !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, imm0_127,
+ !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, imm0_127,
+ !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, imm0_127_64b,
+ !cast<Instruction>(NAME # _D)>;
}
@@ -4096,11 +4402,17 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
let Defs = [NZCV];
}
-multiclass sve2_int_while_rr<bits<1> rw, string asm> {
+multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
}
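// Here `op` is an intrinsic name prefix rather than an SDPatternOperator; the
// !cast<SDPatternOperator>(op # _b) lookups resolve one intrinsic record per
// element size. Illustrative expansion, assuming op = "int_aarch64_sve_whilewr":
// the four patterns bind int_aarch64_sve_whilewr_b/_h/_s/_d to the
// corresponding _B/_H/_S/_D instructions.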
//===----------------------------------------------------------------------===//
@@ -4108,8 +4420,8 @@ multiclass sve2_int_while_rr<bits<1> rw, string asm> {
//===----------------------------------------------------------------------===//
class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
@@ -4127,13 +4439,13 @@ class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
}
multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
- def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
- def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
+ def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat<f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
@@ -4142,8 +4454,8 @@ multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
//===----------------------------------------------------------------------===//
class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
+ ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>,
@@ -4164,13 +4476,13 @@ class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
}
multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
- def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
- def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
+ def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -4210,6 +4522,22 @@ multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_nopred>
+: sve_fp_3op_p_pd<opc, asm, op> {
+ def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16,
+ !cast<Instruction>(NAME # _H), PTRUE_H>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16,
+ !cast<Instruction>(NAME # _H), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16,
+ !cast<Instruction>(NAME # _H), PTRUE_D>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32,
+ !cast<Instruction>(NAME # _S), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32,
+ !cast<Instruction>(NAME # _S), PTRUE_D>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64,
+ !cast<Instruction>(NAME # _D), PTRUE_D>;
+}
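// The extra patterns cover the unpacked FP types: an nxv4f16 vector keeps one
// f16 per 32-bit container, so it reuses the _H compare under an all-true
// 32-bit predicate (PTRUE_S), which activates exactly the lanes that exist;
// likewise nxv2f16/nxv2f32 under PTRUE_D. Illustrative selection, assuming
// op_nopred is an fcmeq-style setcc:
//   fcmeq p0.h, p1/z, z0.h, z1.h   ; with p1 produced by ptrue p1.s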
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - with Zero Group
@@ -4263,11 +4591,20 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ii<string asm> {
- def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
- def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
+multiclass sve_int_index_ii<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
+ def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
+ def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
+ def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)),
+ (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
+ def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
}
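// The immediate operands now match the element width (simm5_8b/simm5_16b
// instead of simm5_32b for .b/.h), so only immediates representable in the
// element type reach the instruction. Illustrative selection, assuming `op`
// is the index/series node:
//   nxv16i8 (op -16, 15)  -->  index z0.b, #-16, #15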
class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4287,11 +4624,20 @@ class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ir<string asm> {
- def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
- def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
+multiclass sve_int_index_ir<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
+ def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
}
class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4311,11 +4657,20 @@ class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ri<string asm> {
- def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
- def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
+multiclass sve_int_index_ri<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
+ def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)),
+ (!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
+ def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)),
+ (!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
+ def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)),
+ (!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
+ def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)),
+ (!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
}
class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4335,19 +4690,23 @@ class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_rr<string asm> {
+multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
//
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
- ZPRRegOp zprty, Operand immtype,
- ElementSizeEnum size>
+ ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
"",
@@ -4366,50 +4725,99 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
- let ElementSize = size;
+ let DestructiveInstType = DestructiveBinaryImm;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> {
+ def _B : SVEPseudo2Instr<psName # _B, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : SVEPseudo2Instr<psName # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : SVEPseudo2Instr<psName # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : SVEPseudo2Instr<psName # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
}
-multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm> {
- def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8,
- ElementSizeB>;
- def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16,
- ElementSizeH> {
+multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
+ string psName,
+ SDPatternOperator op> {
+  def _B : SVEPseudo2Instr<psName # _B, 1>,
+           sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : SVEPseudo2Instr<psName # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
}
- def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32,
- ElementSizeS> {
+ def _S : SVEPseudo2Instr<psName # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
}
- def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64,
- ElementSizeD> {
+ def _D : SVEPseudo2Instr<psName # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm,
+multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> {
+ def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, tvecshiftL32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, tvecshiftL64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftL8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftL16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftL32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftL64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
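// The _ZERO_* pseudos carry FalseLanesZero, so the select-zero patterns keep
// the "inactive lanes are zero" requirement explicit until the pseudo
// expansion pass, where the SVEPseudo2Instr mapping pairs each pseudo with
// its real instruction. A plausible expansion (illustrative; produced by
// that pass, not by this file):
//   z0 = select(pg, lsl(z1, #2), splat(0))
//   ==> movprfx z0.b, pg/z, z1.b
//       lsl     z0.b, pg/m, z0.b, #2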
+
+multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
SDPatternOperator op = null_frag> {
- def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8,
- ElementSizeB>;
- def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16,
- ElementSizeH> {
+ def _B : SVEPseudo2Instr<Ps # _B, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : SVEPseudo2Instr<Ps # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{8} = imm{3};
}
- def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32,
- ElementSizeS> {
+ def _S : SVEPseudo2Instr<Ps # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{9-8} = imm{4-3};
}
- def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64,
- ElementSizeD> {
+ def _D : SVEPseudo2Instr<Ps # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
- def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, vecshiftR8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, vecshiftR16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, vecshiftR64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> {
+ def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftR8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftR16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftR32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftR64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
@@ -4432,23 +4840,40 @@ class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_bin_pred_shift<bits<3> opc, string asm,
- SDPatternOperator op> {
- def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
- def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
- def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
- def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
-
+multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op, string revname, bit isReverseInstr = 0> {
+ let DestructiveInstType = DestructiveBinaryCommWithRev in {
+ def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
+ def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
@@ -4493,7 +4918,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
- "", []>, Sched<[]> {
+ "",
+ []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
@@ -4508,7 +4934,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
@@ -4520,9 +4947,15 @@ multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -4534,6 +4967,11 @@ multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Store Group
@@ -4743,16 +5181,36 @@ class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
let mayStore = 1;
}
-multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
+multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
+  def : Pat<(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+  def : Pat<(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
@@ -5094,6 +5552,17 @@ class sve_int_rdffr_pred<bit s, string asm>
let Uses = [FFR];
}
+multiclass sve_int_rdffr_pred<bit s, string asm, SDPatternOperator op> {
+ def _REAL : sve_int_rdffr_pred<s, asm>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>;
+ }
+}
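// The pseudo is what ISel actually selects (it carries the pattern), and its
// PseudoInstExpansion lowers it to the _REAL encoding only after the passes
// that would reject an FFR use with no visible definition have run.
// Illustrative flow, assuming the usual rdffr intrinsic:
//   llvm.aarch64.sve.rdffr.z(%pg)  -->  pseudo  -->  rdffr p0.b, p1/z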
+
class sve_int_rdffr_unpred<string asm> : I<
(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
@@ -5106,11 +5575,22 @@ class sve_int_rdffr_unpred<string asm> : I<
let Uses = [FFR];
}
-class sve_int_wrffr<string asm>
+multiclass sve_int_rdffr_unpred<string asm, SDPatternOperator op> {
+ def _REAL : sve_int_rdffr_unpred<asm>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd)>;
+ }
+}
+
+class sve_int_wrffr<string asm, SDPatternOperator op>
: I<(outs), (ins PPR8:$Pn),
asm, "\t$Pn",
"",
- []>, Sched<[]> {
+ [(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
bits<4> Pn;
let Inst{31-9} = 0b00100101001010001001000;
let Inst{8-5} = Pn;
@@ -5120,11 +5600,11 @@ class sve_int_wrffr<string asm>
let Defs = [FFR];
}
-class sve_int_setffr<string asm>
+class sve_int_setffr<string asm, SDPatternOperator op>
: I<(outs), (ins),
asm, "",
"",
- []>, Sched<[]> {
+ [(op)]>, Sched<[]> {
let Inst{31-0} = 0b00100101001011001001000000000000;
let hasSideEffects = 1;
@@ -5219,7 +5699,7 @@ class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -5317,7 +5797,7 @@ class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -5332,9 +5812,9 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
@@ -5380,7 +5860,7 @@ class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -5443,11 +5923,11 @@ class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_perm_cpy_r<string asm> {
+multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
@@ -5461,6 +5941,15 @@ multiclass sve_int_perm_cpy_r<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
+
+ def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)),
+ (!cast<Instruction>(NAME # _B) $passthru, $pg, $splat)>;
+ def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)),
+ (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+ def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)),
+ (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
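// Note the operand order in the outputs above: $passthru comes first because
// it becomes the tied destination of the destructive CPY. Illustrative
// selection, assuming `op` is the predicated-dup node:
//   z0 = op(pg, w0, z0_passthru)  -->  cpy z0.b, p0/m, w0   (alias: mov)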
class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -5480,11 +5969,11 @@ class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_perm_cpy_v<string asm> {
+multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
@@ -5498,6 +5987,16 @@ multiclass sve_int_perm_cpy_v<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
+
+ def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
+ (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)),
+ (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
@@ -5557,14 +6056,21 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+ def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+ }
}
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
@@ -5773,6 +6279,13 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
+ }
}
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
@@ -5878,10 +6391,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -5898,10 +6420,19 @@ multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
@@ -5940,8 +6471,15 @@ multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
class sve_mem_prfm_si<bits<2> msz, string asm>
@@ -6022,9 +6560,17 @@ class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
@@ -6047,11 +6593,14 @@ class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let Inst{3-0} = prfop;
}
-multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+
+ def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}
class sve_mem_z_fill<string asm>
@@ -6130,17 +6679,38 @@ class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
+ asm, Z_s>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
+  def : Pat<(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
+ asm, Z_d>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+  def : Pat<(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
@@ -6190,10 +6760,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -6210,10 +6789,19 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
@@ -6224,8 +6812,15 @@ multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
@@ -6235,8 +6830,15 @@ multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
@@ -6274,8 +6876,15 @@ multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
@@ -6305,14 +6914,27 @@ class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
- RegisterOperand zprext> {
+ RegisterOperand zprext, PatFrag frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
+
+ def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
}
@@ -6338,13 +6960,15 @@ class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let hasSideEffects = 1;
}
-multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
-}
+  def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR64:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR64:$Zn, imm_ty:$imm)>;
+}
//===----------------------------------------------------------------------===//
// SVE Compute Vector Address Group
@@ -6600,6 +7224,12 @@ class sve_int_brkp<bits<2> opc, string asm>
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
+multiclass sve_int_brkp<bits<2> opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_brkp<opc, asm>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Partition Break Group
@@ -6626,6 +7256,12 @@ class sve_int_brkn<bit S, string asm>
let Defs = !if(!eq (S, 0b1), [NZCV], []);
}
+multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_brkn<opc, asm>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
+}
+
class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
: I<(outs PPR8:$Pd), iops,
asm, "\t$Pd, $Pg"#suffix#", $Pn",
@@ -6648,12 +7284,16 @@ class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
}
-multiclass sve_int_break_m<bits<3> opc, string asm> {
+multiclass sve_int_break_m<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
-multiclass sve_int_break_z<bits<3> opc, string asm> {
+multiclass sve_int_break_z<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
+
+ def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
@@ -6683,20 +7323,23 @@ class sve2_char_match<bit sz, bit opc, string asm,
let Defs = [NZCV];
}
-multiclass sve2_char_match<bit opc, string asm> {
+multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Segment Group
//===----------------------------------------------------------------------===//
-class sve2_hist_gen_segment<string asm>
+class sve2_hist_gen_segment<string asm, SDPatternOperator op>
: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
- []>, Sched<[]> {
+ [(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
@@ -6730,9 +7373,12 @@ class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve2_hist_gen_vector<string asm> {
+multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> {
def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -6755,6 +7401,12 @@ class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
+multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
+ SDPatternOperator op, ValueType vt> {
+ def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>;
+ def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
@@ -6772,8 +7424,14 @@ class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
let Constraints = "$Zdn = $_Zdn";
}
-class sve2_crypto_unary_op<bit opc, string asm>
-: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn),
+multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
+ SDPatternOperator op, ValueType vt> {
+ def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
+ def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
asm, "\t$Zdn, $_Zdn",
"",
[]>, Sched<[]> {
@@ -6785,3 +7443,389 @@ class sve2_crypto_unary_op<bit opc, string asm>
let Constraints = "$Zdn = $_Zdn";
}
+
+multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
+ def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>;
+ def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE BFloat16 Group
+//===----------------------------------------------------------------------===//
+
+class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
+: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-21} = 0b01100100011;
+ let Inst{15-14} = opc;
+ let Inst{13-10} = 0b0000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ElementSizeH;
+}
+
+class sve_bfloat_dot<string asm>
+: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
+ bits<5> Zm;
+ let Inst{20-16} = Zm;
+}
+
+multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_dot<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
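// Editorial sketch (assumption: ACLE BF16 intrinsics, available with
// +sve+bf16): the dot-product form above corresponds to svbfdot, a widening
// bf16 -> f32 dot product, e.g.:
//
//   #include <arm_sve.h>
//   svfloat32_t bfdot_acc(svfloat32_t acc, svbfloat16_t a, svbfloat16_t b) {
//     return svbfdot_f32(acc, a, b);
//   }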
+
+class sve_bfloat_dot_indexed<string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+}
+
+multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_dot_indexed<asm>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zm;
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-21} = 0b01100100011;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b111001;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ElementSizeH;
+}
+
+multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul_longvecl<bit BT, string asm>
+: sve_bfloat_matmul<asm> {
+ let Inst{23} = 0b1;
+ let Inst{14-13} = 0b00;
+ let Inst{10} = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
+ bits<3> iop;
+ bits<3> Zm;
+ let Inst{23} = 0b1;
+ let Inst{20-19} = iop{2-1};
+ let Inst{18-16} = Zm;
+ let Inst{11} = iop{0};
+ let Inst{10} = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_convert<bit N, string asm>
+: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-25} = 0b0110010;
+ let Inst{24} = N;
+ let Inst{23-13} = 0b10001010101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = DestructiveOther;
+ let hasSideEffects = 1;
+ let ElementSize = ElementSizeS;
+}
+
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_convert<N, asm>;
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Matrix Multiply Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_matmul<bits<2> uns, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = uns;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b100110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> {
+ def NAME : sve_int_matmul<uns, asm>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
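// Editorial sketch (assumption: +sve+i8mm and arm_sve.h): this class encodes
// SMMLA/UMMLA-style matrix multiply-accumulate, where each 128-bit segment
// accumulates a 2x2 i32 tile from 2x8 and 8x2 i8 operands, e.g.:
//
//   #include <arm_sve.h>
//   svint32_t mmla_acc(svint32_t acc, svint8_t a, svint8_t b) {
//     return svmmla_s32(acc, a, b);
//   }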
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01000100100;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b011110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> {
+ def NAME : sve_int_dot_mixed<asm>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed_indexed<bit U, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx),
+ asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<3> Zm;
+ bits<2> idx;
+ let Inst{31-21} = 0b01000100101;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = Zm;
+ let Inst{15-11} = 0b00011;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
+ def NAME : sve_int_dot_mixed_indexed<U, asm>;
+
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Matrix Multiply Accumulate Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-23} = 0b011001001;
+ let Inst{22} = sz;
+ let Inst{21} = 1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b111001;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> {
+ def NAME : sve_fp_matrix_mla<sz, asm, zprty>;
+
+  def : SVE_3_Op_Pat<vt, op, vt, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - Contiguous Load And Replicate 256-bit Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-20} = 0b010;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
+ def NAME : sve_mem_ldor_si<sz, asm, listty>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
+
+ // Base addressing mode
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
+ (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
+}
+
+class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty,
+ ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> {
+ def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
+ (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Interleave 128-bit Elements Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm>
+: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-21} = 0b00000101101;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b000;
+ let Inst{12-11} = opc;
+ let Inst{10} = P;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> {
+ def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
+}
+
+/// Addressing modes
+def am_sve_indexed_s4 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
+def am_sve_indexed_s6 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>;
+
+def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
+def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
+def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
+def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
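// Editorial note: a rough sketch of the ISel hooks these ComplexPatterns
// name (assumption: shapes mirror AArch64ISelDAGToDAG.cpp, not verbatim).
// Each selector splits an i64 address into two result operands:
//
//   template <int Min, int Max> // s4: [-8,7], s6: [-32,31], VL-scaled
//   bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
//                                 SDValue &Base, SDValue &OffImm);
//   template <unsigned Scale>   // reg+reg, index shifted left by Scale
//   bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset);
//
// SDNPWantRoot passes the root node through so the selector can see how the
// address is consumed.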
+
+// Predicated pseudo floating point two operand instructions.
+multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
+ def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// Predicated pseudo integer two operand instructions.
+multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
+ def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// As sve_int_bin_pred but when only i32 and i64 vector types are required.
+multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
new file mode 100644
index 0000000000000..74fe0cdd1ea7f
--- /dev/null
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -0,0 +1,265 @@
+//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs general IR level optimizations on SVE intrinsics.
+//
+// The main goal of this pass is to remove unnecessary reinterpret
+// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g.:
+//
+// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+//
+// This pass also looks for ptest intrinsics & phi instructions where the
+// operands are being needlessly converted to and from svbool_t.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "sve-intrinsic-opts"
+
+namespace llvm {
+void initializeSVEIntrinsicOptsPass(PassRegistry &);
+}
+
+namespace {
+struct SVEIntrinsicOpts : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ SVEIntrinsicOpts() : ModulePass(ID) {
+ initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ static IntrinsicInst *isReinterpretToSVBool(Value *V);
+
+ static bool optimizeIntrinsic(Instruction *I);
+
+ bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+
+ static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+ static bool optimizePTest(IntrinsicInst *I);
+
+ static bool processPhiNode(IntrinsicInst *I);
+};
+} // end anonymous namespace
+
+void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+char SVEIntrinsicOpts::ID = 0;
+static const char *name = "SVE intrinsics optimizations";
+INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
+} // namespace llvm
+
+/// Returns V as an IntrinsicInst if it is a reinterpret cast to
+/// <vscale x 16 x i1> (aka svbool_t), nullptr otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
+ IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+ if (!I)
+ return nullptr;
+
+ if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+ return nullptr;
+
+ return I;
+}
+
+/// Removes redundant reinterpret casts around a PHI node, i.e. in the
+/// presence of control flow.
+bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
+
+ SmallVector<Instruction *, 32> Worklist;
+ auto RequiredType = X->getType();
+
+ auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
+ assert(PN && "Expected Phi Node!");
+
+ // Don't create a new Phi unless we can remove the old one.
+ if (!PN->hasOneUse())
+ return false;
+
+ for (Value *IncValPhi : PN->incoming_values()) {
+ auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
+ if (!Reinterpret ||
+ RequiredType != Reinterpret->getArgOperand(0)->getType())
+ return false;
+ }
+
+ // Create the new Phi
+ LLVMContext &Ctx = PN->getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(PN);
+ PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+ Worklist.push_back(PN);
+
+ for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+ auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+ NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+ Worklist.push_back(Reinterpret);
+ }
+
+  // Clean up the old PHI node and any now-unused reinterprets.
+ X->replaceAllUsesWith(NPN);
+ X->eraseFromParent();
+
+ for (auto &I : Worklist)
+ if (I->use_empty())
+ I->eraseFromParent();
+
+ return true;
+}
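// Editorial sketch of the rewrite above (IR shown as comments): given
//
//   %phi = phi <vscale x 16 x i1> [ %x.bool, %bb0 ], [ %y.bool, %bb1 ]
//   %r = call <vscale x 4 x i1>
//        @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %phi)
//
// where each incoming value is a convert.to.svbool of an nxv4i1, the PHI is
// rebuilt over the narrow type and both conversions become dead:
//
//   %phi = phi <vscale x 4 x i1> [ %x, %bb0 ], [ %y, %bb1 ]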
+
+bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
+ IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+ IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+
+ if (Op1 && Op2 &&
+ Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+ Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+ Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+ Module *M = I->getParent()->getParent()->getParent();
+
+ auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
+ auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
+
+ I->replaceAllUsesWith(CI);
+ I->eraseFromParent();
+ if (Op1->use_empty())
+ Op1->eraseFromParent();
+ if (Op2->use_empty())
+ Op2->eraseFromParent();
+
+ return true;
+ }
+
+ return false;
+}
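// Editorial sketch (IR as comments): with %a, %b of type <vscale x 4 x i1>,
//
//   %pa = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
//   %pb = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
//   %r  = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pa, <vscale x 16 x i1> %pb)
//
// becomes a single ptest on the narrow predicate type:
//
//   %r = call i1 @llvm.aarch64.sve.ptest.any.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b)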
+
+bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
+ assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
+ "Unexpected opcode");
+
+ // If the reinterpret instruction operand is a PHI Node
+ if (isa<PHINode>(I->getArgOperand(0)))
+ return processPhiNode(I);
+
+ // If we have a reinterpret intrinsic I of type A which is converting from
+ // another reinterpret Y of type B, and the source type of Y is A, then we can
+ // elide away both reinterprets if there are no other users of Y.
+ auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
+ if (!Y)
+ return false;
+
+ Value *SourceVal = Y->getArgOperand(0);
+ if (I->getType() != SourceVal->getType())
+ return false;
+
+ I->replaceAllUsesWith(SourceVal);
+ I->eraseFromParent();
+ if (Y->use_empty())
+ Y->eraseFromParent();
+
+ return true;
+}
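// Editorial sketch: this is the round trip from the file header, e.g. with
// %p of type <vscale x 4 x i1>,
//
//   %w = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %p)
//   %n = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %w)
//
// folds so that all uses of %n become uses of %p and both calls are erased.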
+
+bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
+ IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI)
+ return false;
+
+ switch (IntrI->getIntrinsicID()) {
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return optimizeConvertFromSVBool(IntrI);
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ return optimizePTest(IntrI);
+ default:
+ return false;
+ }
+}
+
+bool SVEIntrinsicOpts::optimizeFunctions(
+ SmallSetVector<Function *, 4> &Functions) {
+ bool Changed = false;
+ for (auto *F : Functions) {
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+ // Traverse the DT with an rpo walk so we see defs before uses, allowing
+ // simplification to be done incrementally.
+ BasicBlock *Root = DT->getRoot();
+ ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+ for (auto *BB : RPOT)
+ for (Instruction &I : make_early_inc_range(*BB))
+ Changed |= optimizeIntrinsic(&I);
+ }
+ return Changed;
+}
+
+bool SVEIntrinsicOpts::runOnModule(Module &M) {
+ bool Changed = false;
+ SmallSetVector<Function *, 4> Functions;
+
+ // Check for SVE intrinsic declarations first so that we only iterate over
+ // relevant functions. Where an appropriate declaration is found, store the
+ // function(s) where it is used so we can target these only.
+ for (auto &F : M.getFunctionList()) {
+ if (!F.isDeclaration())
+ continue;
+
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+        // Users of an intrinsic declaration are expected to be calls, but
+        // guard against non-instruction users, for which dyn_cast is null.
+        if (auto *Inst = dyn_cast<Instruction>(*I++))
+          Functions.insert(Inst->getFunction());
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Functions.empty())
+ Changed |= optimizeFunctions(Functions);
+
+ return Changed;
+}
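// Editorial sketch (assumptions: legacy pass manager; in-tree wiring lives
// in AArch64TargetMachine): the pass registers under DEBUG_TYPE, so it can
// be scheduled from the target, e.g.
//
//   addPass(createSVEIntrinsicOptsPass());   // e.g. in addIRPasses()
//
// or exercised in isolation for testing:
//
//   opt -S -sve-intrinsic-opts input.ll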
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 87980cddb7c0b..4e289fbe23257 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -658,6 +658,7 @@ namespace AArch64 {
// in index i*P of a <n x (M*P) x t> vector. The other elements of the
// <n x (M*P) x t> vector (such as index 1) are undefined.
static constexpr unsigned SVEBitsPerBlock = 128;
+static constexpr unsigned SVEMaxBitsPerVector = 2048;
const unsigned NeonBitsPerVector = 128;
} // end namespace AArch64
} // end namespace llvm
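// Editorial sketch (assumption: these constants encode the architectural
// limits, where an SVE vector is 1..16 granules of 128 bits):
//
//   static_assert(AArch64::SVEMaxBitsPerVector % AArch64::SVEBitsPerBlock == 0 &&
//                 AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock == 16,
//                 "SVE VL: multiples of a 128-bit granule, up to 2048 bits");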