path: root/lib/Target/AArch64
author    Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
commit    1d5ae1026e831016fc29fd927877c86af904481f (patch)
tree      2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/AArch64
parent    e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff)
Diffstat (limited to 'lib/Target/AArch64')
-rw-r--r--  lib/Target/AArch64/AArch64.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64.td | 80
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 16
-rw-r--r--  lib/Target/AArch64/AArch64AsmPrinter.cpp | 280
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.cpp | 632
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.h | 29
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.cpp | 38
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.h | 3
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.td | 88
-rw-r--r--  lib/Target/AArch64/AArch64CollectLOH.cpp | 22
-rw-r--r--  lib/Target/AArch64/AArch64Combine.td | 18
-rw-r--r--  lib/Target/AArch64/AArch64CondBrTuning.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 6
-rw-r--r--  lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 76
-rw-r--r--  lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 75
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp | 301
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.h | 28
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 45
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 535
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h | 37
-rw-r--r--  lib/Target/AArch64/AArch64InstrAtomics.td | 65
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td | 220
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 1054
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 12
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 253
-rw-r--r--  lib/Target/AArch64/AArch64InstructionSelector.cpp | 1096
-rw-r--r--  lib/Target/AArch64/AArch64LegalizerInfo.cpp | 111
-rw-r--r--  lib/Target/AArch64/AArch64LegalizerInfo.h | 3
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 160
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64MachineFunctionInfo.h | 17
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 16
-rw-r--r--  lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp | 98
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 39
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 69
-rw-r--r--  lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 8
-rw-r--r--  lib/Target/AArch64/AArch64SVEInstrInfo.td | 264
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64SpeculationHardening.cpp | 13
-rw-r--r--  lib/Target/AArch64/AArch64StackOffset.h | 138
-rw-r--r--  lib/Target/AArch64/AArch64StackTagging.cpp | 394
-rw-r--r--  lib/Target/AArch64/AArch64StackTaggingPreRA.cpp | 209
-rw-r--r--  lib/Target/AArch64/AArch64StorePairSuppress.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 50
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h | 48
-rw-r--r--  lib/Target/AArch64/AArch64SystemOperands.td | 40
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 35
-rw-r--r--  lib/Target/AArch64/AArch64TargetObjectFile.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64TargetObjectFile.h | 3
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 29
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.h | 14
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 123
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 13
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 22
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 3
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 7
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 20
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/AArch64/SVEInstrFormats.td | 366
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 2
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h | 25
68 files changed, 5599 insertions, 1802 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6965403a25ab..ac765ebcddc0 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -55,8 +55,9 @@ FunctionPass *createAArch64CollectLOHPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizeCombiner();
-FunctionPass *createAArch64StackTaggingPass();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64StackTaggingPreRAPass();
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
+void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e39c6995e367..5b4c9e2149da 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -115,11 +115,12 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
"Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
"Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
+
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
@@ -284,6 +285,10 @@ def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
+def FeaturePMU : SubtargetFeature<
+ "pmu", "HasPMU", "true",
+ "Enable v8.4-A PMU extension">;
+
def FeatureTLB_RMI : SubtargetFeature<
"tlb-rmi", "HasTLB_RMI", "true",
"Enable v8.4-A TLB Range and Maintenance Instructions">;
@@ -345,6 +350,21 @@ def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
"true", "Enable Memory Tagging Extension" >;
+def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE",
+ "true", "Enable Trace Buffer Extension">;
+
+def FeatureETE : SubtargetFeature<"ete", "HasETE",
+ "true", "Enable Embedded Trace Extension",
+ [FeatureTRBE]>;
+
+def FeatureTME : SubtargetFeature<"tme", "HasTME",
+ "true", "Enable Transactional Memory Extension" >;
+
+def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
+ "AllowTaggedGlobals",
+ "true", "Use an instruction sequence for taking the address of a global "
+ "that allows a memory tag in the upper address bits">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -354,7 +374,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
FeaturePAN, FeatureLOR, FeatureVH]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
- "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
+ "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
@@ -364,7 +384,7 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
- FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+ FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
FeatureFMI, FeatureRCPC_IMMO]>;
def HasV8_5aOps : SubtargetFeature<
@@ -390,6 +410,7 @@ include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
include "AArch64SchedPredicates.td"
include "AArch64SchedPredExynos.td"
+include "AArch64Combine.td"
def AArch64InstrInfo : InstrInfo;
@@ -484,6 +505,19 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeaturePredictableSelectIsExpensive
]>;
+def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
+ "Cortex-A65 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRAS,
+ FeatureRCPC,
+ FeatureSSBS,
+ ]>;
+
def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
FeatureCRC,
@@ -641,6 +675,33 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureSlowSTRQro
]>;
+def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily",
+ "NeoverseE1",
+ "Neoverse E1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRCPC,
+ FeatureSSBS,
+ ]>;
+
+def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
+ "NeoverseN1",
+ "Neoverse N1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRCPC,
+ FeatureSPE,
+ FeatureSSBS,
+ ]>;
+
def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeatureCrypto,
@@ -732,19 +793,28 @@ def : ProcessorModel<"generic", NoSchedModel, [
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
- FeaturePostRAScheduler
+ FeaturePostRAScheduler,
+// ETE and TRBE are future architecture extensions. We temporariliy enable them
+// by default for users targeting generic AArch64, until it is decided in which
+// armv8.x-a architecture revision they will end up. The extensions do not
+// affect code generated by the compiler and can be used only by explicitly
+// mentioning the new system register names in assembly.
+ FeatureETE
]>;
-// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>;
+def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>;
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
+def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 92c8c4955d50..13d389cec7a0 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -552,7 +552,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
std::vector<unsigned> ToErase;
for (auto &U : I.operands()) {
if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
- unsigned OrigReg = U.getReg();
+ Register OrigReg = U.getReg();
U.setReg(Substs[OrigReg]);
if (U.isKill())
// Don't erase straight away, because there may be other operands
@@ -611,12 +611,12 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// Create a new chain. Multiplies don't require forwarding so can go on any
// unit.
- unsigned DestReg = MI->getOperand(0).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
LLVM_DEBUG(dbgs() << "New chain started for register "
<< printReg(DestReg, TRI) << " at " << *MI);
- auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
@@ -624,8 +624,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// It is beneficial to keep MLAs on the same functional unit as their
// accumulator operand.
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned AccumReg = MI->getOperand(3).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
+ Register AccumReg = MI->getOperand(3).getReg();
maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
@@ -661,7 +661,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
<< printReg(DestReg, TRI) << "\n");
- auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 89404463e1f0..981b366c14b1 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -105,14 +105,14 @@ static bool isGPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
if (SubReg)
return false;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
return AArch64::GPR64RegClass.contains(Reg);
}
static bool isFPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
SubReg == 0) ||
(MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
@@ -201,8 +201,8 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
unsigned NumNewCopies = 3;
unsigned NumRemovableCopies = 0;
- unsigned OrigSrc0 = MI.getOperand(1).getReg();
- unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ Register OrigSrc0 = MI.getOperand(1).getReg();
+ Register OrigSrc1 = MI.getOperand(2).getReg();
unsigned SubReg0;
unsigned SubReg1;
if (!MRI->def_empty(OrigSrc0)) {
@@ -236,7 +236,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
// any of the uses is a transformable instruction, it's likely the tranforms
// will chain, enabling us to save a copy there, too. This is an aggressive
// heuristic that approximates the graph based cost analysis described above.
- unsigned Dst = MI.getOperand(0).getReg();
+ Register Dst = MI.getOperand(0).getReg();
bool AllUsesAreCopies = true;
for (MachineRegisterInfo::use_instr_nodbg_iterator
Use = MRI->use_instr_nodbg_begin(Dst),
@@ -293,8 +293,8 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
assert(OldOpc != NewOpc && "transform an instruction to itself?!");
// Check if we need a copy for the source registers.
- unsigned OrigSrc0 = MI.getOperand(1).getReg();
- unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ Register OrigSrc0 = MI.getOperand(1).getReg();
+ Register OrigSrc1 = MI.getOperand(2).getReg();
unsigned Src0 = 0, SubReg0;
unsigned Src1 = 0, SubReg1;
bool KillSrc0 = false, KillSrc1 = false;
@@ -354,7 +354,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
// Create a vreg for the destination.
// FIXME: No need to do this if the ultimate user expects an FPR64.
// Check for that and avoid the copy if possible.
- unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
// For now, all of the new instructions have the same simple three-register
// form, so no need to special case based on what instruction we're
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 094fbd999523..7ea7915c2ca6 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -99,7 +99,8 @@ public:
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
- std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+ typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
+ std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
void EmitHwasanMemaccessSymbols(Module &M);
@@ -150,7 +151,7 @@ private:
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
bool printAsmRegInClass(const MachineOperand &MO,
- const TargetRegisterClass *RC, bool isVector,
+ const TargetRegisterClass *RC, unsigned AltName,
raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
@@ -236,9 +237,12 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
}
void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
+ bool IsShort =
+ MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES;
uint32_t AccessInfo = MI.getOperand(1).getImm();
- MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+ MCSymbol *&Sym =
+ HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)];
if (!Sym) {
// FIXME: Make this work on non-ELF.
if (!TM.getTargetTriple().isOSBinFormatELF())
@@ -246,6 +250,8 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
utostr(AccessInfo);
+ if (IsShort)
+ SymName += "_short";
Sym = OutContext.getOrCreateSymbol(SymName);
}
@@ -263,15 +269,22 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
std::unique_ptr<MCSubtargetInfo> STI(
TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
- MCSymbol *HwasanTagMismatchSym =
+ MCSymbol *HwasanTagMismatchV1Sym =
OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+ MCSymbol *HwasanTagMismatchV2Sym =
+ OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2");
- const MCSymbolRefExpr *HwasanTagMismatchRef =
- MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+ const MCSymbolRefExpr *HwasanTagMismatchV1Ref =
+ MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext);
+ const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
+ MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
for (auto &P : HwasanMemaccessSymbols) {
- unsigned Reg = P.first.first;
- uint32_t AccessInfo = P.first.second;
+ unsigned Reg = std::get<0>(P.first);
+ bool IsShort = std::get<1>(P.first);
+ uint32_t AccessInfo = std::get<2>(P.first);
+ const MCSymbolRefExpr *HwasanTagMismatchRef =
+ IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
MCSymbol *Sym = P.second;
OutStreamer->SwitchSection(OutContext.getELFSection(
@@ -304,82 +317,86 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addReg(Reg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
- MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+ MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::NE)
- .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
+ OutContext)),
*STI);
MCSymbol *ReturnSym = OutContext.createTempSymbol();
OutStreamer->EmitLabel(ReturnSym);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+ OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
- OutStreamer->EmitLabel(HandlePartialSym);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
- .addReg(AArch64::WZR)
- .addReg(AArch64::W16)
- .addImm(15)
- .addImm(0),
- *STI);
- MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::HI)
- .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
- *STI);
-
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::ANDXri)
- .addReg(AArch64::X17)
- .addReg(Reg)
- .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
- *STI);
- unsigned Size = 1 << (AccessInfo & 0xf);
- if (Size != 1)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
- .addReg(AArch64::X17)
- .addReg(AArch64::X17)
- .addImm(Size - 1)
+ if (IsShort) {
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addImm(15)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
- .addReg(AArch64::WZR)
- .addReg(AArch64::W16)
- .addReg(AArch64::W17)
- .addImm(0),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::LS)
- .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
- *STI);
-
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::ORRXri)
- .addReg(AArch64::X16)
- .addReg(Reg)
- .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
- *STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W16)
- .addReg(AArch64::X16)
- .addImm(0),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::SUBSXrs)
- .addReg(AArch64::XZR)
- .addReg(AArch64::X16)
- .addReg(Reg)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::EQ)
- .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
- *STI);
+ MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::HI)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ANDXri)
+ .addReg(AArch64::X17)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ unsigned Size = 1 << (AccessInfo & 0xf);
+ if (Size != 1)
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X17)
+ .addReg(AArch64::X17)
+ .addImm(Size - 1)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::W17)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::LS)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ORRXri)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::X16)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitLabel(HandleMismatchSym);
+ }
- OutStreamer->EmitLabel(HandleMismatchSym);
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
.addReg(AArch64::SP)
.addReg(AArch64::X0)
@@ -414,16 +431,16 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
MCInstBuilder(AArch64::ADRP)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef,
- AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
+ OutContext)),
*STI);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::LDRXui)
.addReg(AArch64::X16)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef,
- AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
+ OutContext)),
*STI);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
@@ -485,15 +502,14 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
default:
llvm_unreachable("<unknown operand type>");
case MachineOperand::MO_Register: {
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg));
assert(!MO.getSubReg() && "Subregs should be eliminated!");
O << AArch64InstPrinter::getRegisterName(Reg);
break;
}
case MachineOperand::MO_Immediate: {
- int64_t Imm = MO.getImm();
- O << '#' << Imm;
+ O << MO.getImm();
break;
}
case MachineOperand::MO_GlobalAddress: {
@@ -510,7 +526,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
raw_ostream &O) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
switch (Mode) {
default:
return true; // Unknown mode.
@@ -531,14 +547,13 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
// printing.
bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
- bool isVector, raw_ostream &O) {
+ unsigned AltName, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
const TargetRegisterInfo *RI = STI->getRegisterInfo();
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
- O << AArch64InstPrinter::getRegisterName(
- RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+ O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
return false;
}
@@ -574,6 +589,7 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 's': // Print S register.
case 'd': // Print D register.
case 'q': // Print Q register.
+ case 'z': // Print Z register.
if (MO.isReg()) {
const TargetRegisterClass *RC;
switch (ExtraCode[0]) {
@@ -592,10 +608,13 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 'q':
RC = &AArch64::FPR128RegClass;
break;
+ case 'z':
+ RC = &AArch64::ZPRRegClass;
+ break;
default:
return true;
}
- return printAsmRegInClass(MO, RC, false /* vector */, O);
+ return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O);
}
printOperand(MI, OpNum, O);
return false;
@@ -605,16 +624,26 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
// According to ARM, we should emit x and v registers unless we have a
// modifier.
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If this is a w or x register, print an x register.
if (AArch64::GPR32allRegClass.contains(Reg) ||
AArch64::GPR64allRegClass.contains(Reg))
return printAsmMRegister(MO, 'x', O);
+ unsigned AltName = AArch64::NoRegAltName;
+ const TargetRegisterClass *RegClass;
+ if (AArch64::ZPRRegClass.contains(Reg)) {
+ RegClass = &AArch64::ZPRRegClass;
+ } else if (AArch64::PPRRegClass.contains(Reg)) {
+ RegClass = &AArch64::PPRRegClass;
+ } else {
+ RegClass = &AArch64::FPR128RegClass;
+ AltName = AArch64::vreg;
+ }
+
// If this is a b, h, s, d, or q register, print it as a v register.
- return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
- O);
+ return printAsmRegInClass(MO, RegClass, AltName, O);
}
printOperand(MI, OpNum, O);
@@ -682,7 +711,7 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
if (JTBBs.empty()) continue;
unsigned Size = AFI->getJumpTableEntrySize(JTI);
- EmitAlignment(Log2_32(Size));
+ EmitAlignment(Align(Size));
OutStreamer->EmitLabel(GetJTISymbol(JTI));
for (auto *JTBB : JTBBs)
@@ -725,12 +754,12 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
/// add xDest, xDest, xScratch, lsl #2
void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
const llvm::MachineInstr &MI) {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned ScratchReg = MI.getOperand(1).getReg();
- unsigned ScratchRegW =
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register ScratchRegW =
STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
- unsigned TableReg = MI.getOperand(2).getReg();
- unsigned EntryReg = MI.getOperand(3).getReg();
+ Register TableReg = MI.getOperand(2).getReg();
+ Register EntryReg = MI.getOperand(3).getReg();
int JTIdx = MI.getOperand(4).getIndex();
bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
@@ -800,7 +829,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
- unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 16;
// Materialize the jump address:
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
@@ -830,7 +859,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
}
void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
// Convert H/S/D register to corresponding Q register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
@@ -894,32 +923,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
default:
break;
case AArch64::MOVMCSym: {
- unsigned DestReg = MI->getOperand(0).getReg();
- const MachineOperand &MO_Sym = MI->getOperand(1);
- MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
- MCOperand Hi_MCSym, Lo_MCSym;
-
- Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
- Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
-
- MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
- MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
-
- MCInst MovZ;
- MovZ.setOpcode(AArch64::MOVZXi);
- MovZ.addOperand(MCOperand::createReg(DestReg));
- MovZ.addOperand(Hi_MCSym);
- MovZ.addOperand(MCOperand::createImm(16));
- EmitToStreamer(*OutStreamer, MovZ);
-
- MCInst MovK;
- MovK.setOpcode(AArch64::MOVKXi);
- MovK.addOperand(MCOperand::createReg(DestReg));
- MovK.addOperand(MCOperand::createReg(DestReg));
- MovK.addOperand(Lo_MCSym);
- MovK.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MovK);
- return;
+ Register DestReg = MI->getOperand(0).getReg();
+ const MachineOperand &MO_Sym = MI->getOperand(1);
+ MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+ MCOperand Hi_MCSym, Lo_MCSym;
+
+ Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+ Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+ MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+ MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+ MCInst MovZ;
+ MovZ.setOpcode(AArch64::MOVZXi);
+ MovZ.addOperand(MCOperand::createReg(DestReg));
+ MovZ.addOperand(Hi_MCSym);
+ MovZ.addOperand(MCOperand::createImm(16));
+ EmitToStreamer(*OutStreamer, MovZ);
+
+ MCInst MovK;
+ MovK.setOpcode(AArch64::MOVKXi);
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(Lo_MCSym);
+ MovK.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MovK);
+ return;
}
case AArch64::MOVIv2d_ns:
// If the target has <rdar://problem/16473581>, lower this
@@ -1084,6 +1113,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
case AArch64::HWASAN_CHECK_MEMACCESS:
+ case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
LowerHWASAN_CHECK_MEMACCESS(*MI);
return;
@@ -1193,4 +1223,6 @@ extern "C" void LLVMInitializeAArch64AsmPrinter() {
RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
+ RegisterAsmPrinter<AArch64AsmPrinter> W(getTheARM64_32Target());
+ RegisterAsmPrinter<AArch64AsmPrinter> V(getTheAArch64_32Target());
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 59757769c89a..ed93d02aa615 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -99,7 +99,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
uint64_t StackUsed;
};
@@ -110,6 +110,7 @@ struct FormalArgHandler : public IncomingArgHandler {
: IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -129,14 +130,29 @@ struct CallReturnHandler : public IncomingArgHandler {
struct OutgoingArgHandler : public CallLowering::ValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn,
- CCAssignFn *AssignFnVarArg)
+ CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
+ int FPDiff = 0)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
+ AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
+ StackSize(0) {}
+
+ bool isIncomingArgumentHandler() const override { return false; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
+ MachineFunction &MF = MIRBuilder.getMF();
LLT p0 = LLT::pointer(0, 64);
LLT s64 = LLT::scalar(64);
+
+ if (IsTailCall) {
+ Offset += FPDiff;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ Register FIReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildFrameIndex(FIReg, FI);
+ MPO = MachinePointerInfo::getFixedStack(MF, FI);
+ return FIReg;
+ }
+
Register SPReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
@@ -146,7 +162,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
- MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ MPO = MachinePointerInfo::getStack(MF, Offset);
return AddrReg;
}
@@ -173,12 +189,13 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
+ ISD::ArgFlagsTy Flags,
CCState &State) override {
bool Res;
if (Info.IsFixed)
- Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
else
- Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
StackSize = State.getNextStackOffset();
return Res;
@@ -186,10 +203,19 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MachineInstrBuilder MIB;
CCAssignFn *AssignFnVarArg;
+ bool IsTailCall;
+
+ /// For tail calls, the byte offset of the call's argument area from the
+ /// callee's. Unused elsewhere.
+ int FPDiff;
uint64_t StackSize;
};
} // namespace
+static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) {
+ return CallConv == CallingConv::Fast && TailCallOpt;
+}
+
void AArch64CallLowering::splitToValueTypes(
const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
@@ -207,7 +233,7 @@ void AArch64CallLowering::splitToValueTypes(
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags, OrigArg.IsFixed);
+ OrigArg.Flags[0], OrigArg.IsFixed);
return;
}
@@ -218,13 +244,13 @@ void AArch64CallLowering::splitToValueTypes(
OrigArg.Ty, CallConv, false);
for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
- SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
OrigArg.IsFixed);
if (NeedsRegBlock)
- SplitArgs.back().Flags.setInConsecutiveRegs();
+ SplitArgs.back().Flags[0].setInConsecutiveRegs();
}
- SplitArgs.back().Flags.setInConsecutiveRegsLast();
+ SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -344,6 +370,49 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return Success;
}
+/// Helper function to compute forwarded registers for musttail calls. Computes
+/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
+/// can be used to save + restore registers later.
+static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
+ CCAssignFn *AssignFn) {
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!MFI.hasMustTailInVarArgFunc())
+ return;
+
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ const Function &F = MF.getFunction();
+ assert(F.isVarArg() && "Expected F to be vararg?");
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs,
+ F.getContext());
+ SmallVector<MVT, 2> RegParmTypes;
+ RegParmTypes.push_back(MVT::i64);
+ RegParmTypes.push_back(MVT::f128);
+
+ // Later on, we can use this vector to restore the registers if necessary.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn);
+
+ // Conservatively forward X8, since it might be used for an aggregate
+ // return.
+ if (!CCInfo.isAllocated(AArch64::X8)) {
+ unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+ Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+ }
+
+ // Add the forwards to the MachineBasicBlock and MachineFunction.
+ for (const auto &F : Forwards) {
+ MBB.addLiveIn(F.PReg);
+ MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg));
+ }
+}
+
bool AArch64CallLowering::lowerFormalArguments(
MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -376,64 +445,530 @@ bool AArch64CallLowering::lowerFormalArguments(
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ uint64_t StackOffset = Handler.StackUsed;
if (F.isVarArg()) {
- if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
- // FIXME: we need to reimplement saveVarArgsRegisters from
+ auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isTargetDarwin()) {
+ // FIXME: we need to reimplement saveVarArgsRegisters from
// AArch64ISelLowering.
return false;
}
- // We currently pass all varargs at 8-byte alignment.
- uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+ // We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
+ StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
}
+ if (doesCalleeRestoreStack(F.getCallingConv(),
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ // We have a non-standard ABI, so why not make full use of the stack that
+ // we're going to pop? It must be aligned to 16 B in any case.
+ StackOffset = alignTo(StackOffset, 16);
+
+ // If we're expected to restore the stack (e.g. fastcc), then we'll be
+ // adding a multiple of 16.
+ FuncInfo->setArgumentStackToRestore(StackOffset);
+
+ // Our own callers will guarantee that the space is free by giving an
+ // aligned value to CALLSEQ_START.
+ }
+
+ // When we tail call, we need to check if the callee's arguments
+ // will fit on the caller's stack. So, whenever we lower formal arguments,
+ // we should keep track of this information, since we might lower a tail call
+ // in this function later.
+ FuncInfo->setBytesInStackArgArea(StackOffset);
+
auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (Subtarget.hasCustomCallingConv())
Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
+ handleMustTailForwardedRegisters(MIRBuilder, AssignFn);
+
// Move back to the end of the basic block.
MIRBuilder.setMBB(MBB);
return true;
}
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::PreserveMost:
+ case CallingConv::Swift:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
+/// CC.
+static std::pair<CCAssignFn *, CCAssignFn *>
+getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) {
+ return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
+}
+
+bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const {
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ // If the calling conventions match, then everything must be the same.
+ if (CalleeCC == CallerCC)
+ return true;
+
+ // Check if the caller and callee will handle arguments in the same way.
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ CCAssignFn *CalleeAssignFnFixed;
+ CCAssignFn *CalleeAssignFnVarArg;
+ std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+ getAssignFnsForCC(CalleeCC, TLI);
+
+ CCAssignFn *CallerAssignFnFixed;
+ CCAssignFn *CallerAssignFnVarArg;
+ std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+ getAssignFnsForCC(CallerCC, TLI);
+
+ if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
+ *CalleeAssignFnVarArg, *CallerAssignFnFixed,
+ *CallerAssignFnVarArg))
+ return false;
+
+ // Make sure that the caller and callee preserve all of the same registers.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) {
+ TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
+ TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
+ }
+
+ return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
+}
+
+bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // If there are no outgoing arguments, then we are done.
+ if (OutArgs.empty())
+ return true;
+
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ // We have outgoing arguments. Make sure that we can tail call with them.
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+
+ if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) {
+ LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+ return false;
+ }
+
+ // Make sure that they can fit on the caller's stack.
+ const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+ LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+ return false;
+ }
+
+ // Verify that the parameters in callee-saved registers match.
+ // TODO: Port this over to CallLowering as general code once swiftself is
+ // supported.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned i = 0; i < OutLocs.size(); ++i) {
+ auto &ArgLoc = OutLocs[i];
+ // If it's not a register, it's fine.
+ if (!ArgLoc.isRegLoc()) {
+ if (Info.IsVarArg) {
+ // Be conservative and disallow variadic memory operands to match SDAG's
+ // behaviour.
+ // FIXME: If the caller's calling convention is C, then we can
+ // potentially use its argument area. However, for cases like fastcc,
+ // we can't do anything.
+ LLVM_DEBUG(
+ dbgs()
+ << "... Cannot tail call vararg function with stack arguments\n");
+ return false;
+ }
+ continue;
+ }
+
+ Register Reg = ArgLoc.getLocReg();
+
+ // Only look at callee-saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+ continue;
+
+ LLVM_DEBUG(
+ dbgs()
+ << "... Call has an argument passed in a callee-saved register.\n");
+
+ // Check if it was copied from.
+ ArgInfo &OutInfo = OutArgs[i];
+
+ if (OutInfo.Regs.size() > 1) {
+ LLVM_DEBUG(
+ dbgs() << "... Cannot handle arguments in multiple registers.\n");
+ return false;
+ }
+
+ // Check if we copy the register, walking through copies from virtual
+ // registers. Note that getDefIgnoringCopies does not ignore copies from
+ // physical registers.
+ MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
+ if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Parameter was not copied into a VReg, cannot tail call.\n");
+ return false;
+ }
+
+ // Got a copy. Verify that it's the same as the register we want.
+ Register CopyRHS = RegDef->getOperand(1).getReg();
+ if (CopyRHS != Reg) {
+ LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
+ "VReg, cannot tail call.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool AArch64CallLowering::isEligibleForTailCallOptimization(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+
+ // Must pass all target-independent checks in order to tail call optimize.
+ if (!Info.IsTailCall)
+ return false;
+
+ CallingConv::ID CalleeCC = Info.CallConv;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &CallerF = MF.getFunction();
+
+ LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n");
+
+ if (Info.SwiftErrorVReg) {
+ // TODO: We should handle this.
+ // Note that this is also handled by the check for no outgoing arguments.
+ // Proactively disabling this though, because the swifterror handling in
+ // lowerCall inserts a COPY *after* the location of the call.
+ LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n");
+ return false;
+ }
+
+ if (!mayTailCallThisCC(CalleeCC)) {
+ LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+ return false;
+ }
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible (see
+ // X86).
+ //
+ // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try
+ // it?
+ //
+ // On Windows, "inreg" attributes signify non-aggregate indirect returns.
+ // In this case, it is necessary to save/restore X0 in the callee. Tail
+ // call opt interferes with this. So we disable tail call opt when the
+ // caller has an argument with "inreg" attribute.
+ //
+ // FIXME: Check whether the callee also has an "inreg" argument.
+ //
+ // When the caller has a swifterror argument, we don't want to tail call
+ // because would have to move into the swifterror register before the
+ // tail call.
+ if (any_of(CallerF.args(), [](const Argument &A) {
+ return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr();
+ })) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, "
+ "inreg, or swifterror arguments\n");
+ return false;
+ }
+
+ // Externally-defined functions with weak linkage should not be
+ // tail-called on AArch64 when the OS does not support dynamic
+ // pre-emption of symbols, as the AAELF spec requires normal calls
+ // to undefined weak functions to be replaced with a NOP or jump to the
+ // next instruction. The behaviour of branch instructions in this
+ // situation (as used for tail calls) is implementation-defined, so we
+ // cannot rely on the linker replacing the tail call with a return.
+ if (Info.Callee.isGlobal()) {
+ const GlobalValue *GV = Info.Callee.getGlobal();
+ const Triple &TT = MF.getTarget().getTargetTriple();
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
+ TT.isOSBinFormatMachO())) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function "
+ "with weak linkage for this OS.\n");
+ return false;
+ }
+ }
+
+ // If we have -tailcallopt, then we're done.
+ if (MF.getTarget().Options.GuaranteedTailCallOpt)
+ return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+ // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall).
+ // Try to find cases where we can do that.
+
+ // I want anyone implementing a new calling convention to think long and hard
+ // about this assert.
+ assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
+ "Unexpected variadic calling convention");
+
+ // Verify that the incoming and outgoing arguments from the callee are
+ // safe to tail call.
+ if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Caller and callee have incompatible calling conventions.\n");
+ return false;
+ }
+
+ if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "... Call is eligible for tail call optimization.\n");
+ return true;
+}
+
+static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+ bool IsTailCall) {
+ if (!IsTailCall)
+ return IsIndirect ? AArch64::BLR : AArch64::BL;
+
+ if (!IsIndirect)
+ return AArch64::TCRETURNdi;
+
+ // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
+ // x16 or x17.
+ if (CallerF.hasFnAttribute("branch-target-enforcement"))
+ return AArch64::TCRETURNriBTI;
+
+ return AArch64::TCRETURNri;
+}
+
+bool AArch64CallLowering::lowerTailCall(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+ // True when we're tail calling, but without -tailcallopt.
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
+ // register class. Until we can do that, we should fall back here.
+ if (F.hasFnAttribute("branch-target-enforcement")) {
+ LLVM_DEBUG(
+ dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n");
+ return false;
+ }
+
+ // Find out which ABI gets to decide where things go.
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ MachineInstrBuilder CallSeqStart;
+ if (!IsSibCall)
+ CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+
+ unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ MIB.add(Info.Callee);
+
+ // Byte offset for the tail call. When we are sibcalling, this will always
+ // be 0.
+ MIB.addImm(0);
+
+ // Tell the call which registers are clobbered.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+ if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
+ TRI->UpdateCustomCallPreservedMask(MF, &Mask);
+ MIB.addRegMask(Mask);
+
+ if (TRI->isAnyArgRegReserved(MF))
+ TRI->emitReservedArgRegCallError(MF);
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0.
+ int FPDiff = 0;
+
+ // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+ // by -tailcallopt. For sibcalls, the memory operands for the call are
+ // already available in the caller's incoming argument space.
+ unsigned NumBytes = 0;
+ if (!IsSibCall) {
+ // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+ // before handling assignments, because FPDiff must be known for memory
+ // arguments.
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+ analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg);
+
+ // The callee will pop the argument stack as a tail call. Thus, we must
+ // keep it 16-byte aligned.
+ NumBytes = alignTo(OutInfo.getNextStackOffset(), 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
+
+ const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+
+ // Do the actual argument marshalling.
+ SmallVector<unsigned, 8> PhysRegs;
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg, true, FPDiff);
+ if (!handleAssignments(MIRBuilder, OutArgs, Handler))
+ return false;
+
+ if (Info.IsVarArg && Info.IsMustTailCall) {
+ // Now we know what's being passed to the function. Add uses to the call for
+ // the forwarded registers that we *aren't* passing as parameters. This will
+ // preserve the copies we build earlier.
+ for (const auto &F : Forwards) {
+ Register ForwardedReg = F.PReg;
+ // If the register is already passed, or aliases a register which is
+ // already being passed, then skip it.
+ if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) {
+ if (!Use.isReg())
+ return false;
+ return TRI->regsOverlap(Use.getReg(), ForwardedReg);
+ }))
+ continue;
+
+ // We aren't passing it already, so we should add it to the call.
+ MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg));
+ MIB.addReg(ForwardedReg, RegState::Implicit);
+ }
+ }
+
+ // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+ // sequence start and end here.
+ if (!IsSibCall) {
+ MIB->getOperand(1).setImm(FPDiff);
+ CallSeqStart.addImm(NumBytes).addImm(0);
+ // End the call sequence *before* emitting the call. Normally, we would
+ // tidy the frame up after the call. However, here, we've laid out the
+ // parameters so that when SP is reset, they will be in the correct
+ // location.
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0);
+ }
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If Callee is a reg, since it is used by a target specific instruction,
+ // it must have a register class matching the constraint of that instruction.
+ if (Info.Callee.isReg())
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
+
+ MF.getFrameInfo().setHasTailCall();
+ Info.LoweredTailCall = true;
+ return true;
+}
+
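To make the FPDiff bookkeeping in lowerTailCall above easier to follow, here is a minimal standalone sketch of the arithmetic (plain C++, not LLVM API; the helper and parameter names simply mirror the quantities used above and are otherwise illustrative):

    #include <cassert>
    #include <cstdint>

    // Round Size up to the next multiple of Align (Align must be a power of two).
    static uint64_t alignUp(uint64_t Size, uint64_t Align) {
      return (Size + Align - 1) & ~(Align - 1);
    }

    // FPDiff models the byte offset between the caller's incoming argument area
    // (NumReusableBytes) and the argument area the tail-called function expects.
    // A negative value means the tail call needs more stack than the caller has
    // already reserved for its own incoming arguments.
    static int64_t computeFPDiff(uint64_t NumReusableBytes, uint64_t NextStackOffset) {
      // The callee pops its argument area, so keep it 16-byte aligned.
      uint64_t NumBytes = alignUp(NextStackOffset, 16);
      int64_t FPDiff = (int64_t)NumReusableBytes - (int64_t)NumBytes;
      // Both areas start at a 16-byte aligned SP, so the delta stays aligned.
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      return FPDiff;
    }

    int main() {
      // Caller reserved 32 bytes of incoming arguments; the callee needs 41,
      // rounded up to 48, so the tail call must grow the area by 16 bytes.
      return computeFPDiff(32, 41) == -16 ? 0 : 1;
    }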
bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs,
- Register SwiftErrorVReg) const {
+ CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
auto &DL = F.getParent()->getDataLayout();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- SmallVector<ArgInfo, 8> SplitArgs;
- for (auto &OrigArg : OrigArgs) {
- splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+ SmallVector<ArgInfo, 8> OutArgs;
+ for (auto &OrigArg : Info.OrigArgs) {
+ splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
// AAPCS requires that we zero-extend i1 to 8 bits by the caller.
if (OrigArg.Ty->isIntegerTy(1))
- SplitArgs.back().Flags.setZExt();
+ OutArgs.back().Flags[0].setZExt();
+ }
+
+ SmallVector<ArgInfo, 8> InArgs;
+ if (!Info.OrigRet.Ty->isVoidTy())
+ splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
+
+ // If we can lower as a tail call, do that instead.
+ bool CanTailCallOpt =
+ isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
+
+ // We must emit a tail call if we have musttail.
+ if (Info.IsMustTailCall && !CanTailCallOpt) {
+ // There are types of incoming/outgoing arguments we can't handle yet, so
+ // it doesn't make sense to actually die here like in ISelLowering. Instead,
+ // fall back to SelectionDAG and let it try to handle this.
+ LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
+ return false;
}
+ if (CanTailCallOpt)
+ return lowerTailCall(MIRBuilder, Info, OutArgs);
+
// Find out which ABI gets to decide where things go.
- const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- CCAssignFn *AssignFnFixed =
- TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
- CCAssignFn *AssignFnVarArg =
- TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) =
+ getAssignFnsForCC(Info.CallConv, TLI);
- auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+ MachineInstrBuilder CallSeqStart;
+ CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
- : AArch64::BL);
- MIB.add(Callee);
+ unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ MIB.add(Info.Callee);
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
@@ -448,8 +983,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ AssignFnVarArg, false);
+ if (!handleAssignments(MIRBuilder, OutArgs, Handler))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -458,34 +993,37 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
- if (Callee.isReg())
+ if (Info.Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
- if (!OrigRet.Ty->isVoidTy()) {
- SplitArgs.clear();
-
- splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv());
-
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ if (!handleAssignments(MIRBuilder, InArgs, Handler))
return false;
}
- if (SwiftErrorVReg) {
+ if (Info.SwiftErrorVReg) {
MIB.addDef(AArch64::X21, RegState::Implicit);
- MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21));
+ MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21));
}
+ uint64_t CalleePopBytes =
+ doesCalleeRestoreStack(Info.CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt)
+ ? alignTo(Handler.StackSize, 16)
+ : 0;
+
CallSeqStart.addImm(Handler.StackSize).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
.addImm(Handler.StackSize)
- .addImm(0);
+ .addImm(CalleePopBytes);
return true;
}
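The CalleePopBytes value computed at the end of lowerCall can be summarised with a small standalone sketch (the CC enum and canGuaranteeTCO stand-in below are illustrative, not the real LLVM types): only when -tailcallopt is in effect and the calling convention can guarantee tail calls does the callee pop its own, 16-byte aligned, argument area.

    #include <cstdint>

    enum class CC { C, Fast };

    // Illustrative stand-in for the target's "can this convention guarantee tail
    // call optimisation?" query consulted by doesCalleeRestoreStack.
    static bool canGuaranteeTCO(CC CallConv) { return CallConv == CC::Fast; }

    static uint64_t alignUp(uint64_t Size, uint64_t Align) {
      return (Size + Align - 1) & ~(Align - 1);
    }

    // Mirrors the CalleePopBytes computation: the callee only deallocates its own
    // argument area when guaranteed tail calls are enabled for this convention.
    static uint64_t calleePopBytes(CC CallConv, bool GuaranteedTailCallOpt,
                                   uint64_t StackSize) {
      bool CalleeRestores = GuaranteedTailCallOpt && canGuaranteeTCO(CallConv);
      return CalleeRestores ? alignUp(StackSize, 16) : 0;
    }

    int main() {
      return (calleePopBytes(CC::Fast, true, 24) == 32 &&
              calleePopBytes(CC::C, true, 24) == 0)
                 ? 0
                 : 1;
    }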
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index 4f428f254537..b0c601c7062c 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -40,16 +40,15 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs,
- Register SwiftErrorVReg) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override {
- return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0);
- }
+ /// Returns true if the call can be lowered as a tail call.
+ bool
+ isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
bool supportSwiftError() const override { return true; }
@@ -64,6 +63,18 @@ private:
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
CallingConv::ID CallConv) const;
+
+ bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ bool
+ doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const;
+
+ bool
+ areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp
index 02538a187611..a0695cef615f 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -40,12 +40,14 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
CCState &State, unsigned SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
- unsigned StackAlign =
+ const Align StackAlign =
State.getMachineFunction().getDataLayout().getStackAlignment();
- unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+ const Align OrigAlign(ArgFlags.getOrigAlign());
+ const Align Align = std::min(OrigAlign, StackAlign);
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+ It.convertToMem(State.AllocateStack(
+ Size, std::max((unsigned)Align.value(), SlotAlign)));
State.addLoc(It);
SlotAlign = 1;
}
@@ -79,10 +81,14 @@ static bool CC_AArch64_Custom_Stack_Block(
static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO();
+
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
ArrayRef<MCPhysReg> RegList;
- if (LocVT.SimpleTy == MVT::i64)
+ if (LocVT.SimpleTy == MVT::i64 || (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32))
RegList = XRegList;
else if (LocVT.SimpleTy == MVT::f16)
RegList = HRegList;
@@ -107,8 +113,12 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
- unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
- if (RegResult) {
+ // [N x i32] arguments get packed into x-registers on Darwin's arm64_32
+ // because that's how the armv7k Clang front-end emits small structs.
+ unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 2 : 1;
+ unsigned RegResult = State.AllocateRegBlock(
+ RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg);
+ if (RegResult && EltsPerReg == 1) {
for (auto &It : PendingMembers) {
It.convertToReg(RegResult);
State.addLoc(It);
@@ -116,14 +126,26 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
}
PendingMembers.clear();
return true;
+ } else if (RegResult) {
+ assert(EltsPerReg == 2 && "unexpected ABI");
+ bool UseHigh = false;
+ CCValAssign::LocInfo Info;
+ for (auto &It : PendingMembers) {
+ Info = UseHigh ? CCValAssign::AExtUpper : CCValAssign::ZExt;
+ State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult,
+ MVT::i64, Info));
+ UseHigh = !UseHigh;
+ if (!UseHigh)
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
}
// Mark all regs in the class as unavailable
for (auto Reg : RegList)
State.AllocateReg(Reg);
- const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
- State.getMachineFunction().getSubtarget());
unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
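The arm64_32 (EltsPerReg == 2) path above packs two i32 block members into each 64-bit X register: the first element is zero-extended into the low half, the second lands in the upper half, and the register number only advances after a pair. A standalone model of the resulting register contents (plain C++, no LLVM types):

    #include <cstdint>
    #include <vector>

    // Pack an [N x i32] block into 64-bit registers, two elements per register:
    // element 0 in bits [31:0] (zero-extended), element 1 in bits [63:32], then
    // advance to the next register. An odd trailing element leaves the upper half
    // unspecified; this model simply leaves it zero.
    static std::vector<uint64_t> packI32Block(const std::vector<uint32_t> &Elts) {
      std::vector<uint64_t> Regs;
      for (size_t I = 0; I < Elts.size(); I += 2) {
        uint64_t Lo = Elts[I];
        uint64_t Hi = (I + 1 < Elts.size()) ? Elts[I + 1] : 0;
        Regs.push_back((Hi << 32) | Lo);
      }
      return Regs;
    }

    int main() {
      // A small struct of three i32s occupies two X registers: {b:a} and {-:c}.
      auto Regs = packI32Block({1, 2, 3});
      return (Regs.size() == 2 && Regs[0] == ((2ull << 32) | 1) && Regs[1] == 3)
                 ? 0
                 : 1;
    }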
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index 13cc0c583fd2..5a55d090d7c8 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index d969a9e1ab3a..bccbbd4591ed 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -17,6 +17,10 @@ class CCIfAlign<string Align, CCAction A> :
class CCIfBigEndian<CCAction A> :
CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
+class CCIfILP32<CCAction A> :
+ CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>;
+
+
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
//===----------------------------------------------------------------------===//
@@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCPassIndirect<i64>>,
+
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCAssignToReg<[P0, P1, P2, P3]>>,
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCPassIndirect<i64>>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
// Big endian vectors must be passed as if they were 1-element vectors so that
@@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
- CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCAssignToReg<[P0, P1, P2, P3]>>
]>;
// Vararg functions on windows pass floats in integer registers
@@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Re-demote pointers to 32 bits so we don't end up storing 64-bit
+ // values and clobbering neighbouring stack locations. Not very pretty.
+ CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+ CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
+
CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
@@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCAssignToStack<16, 16>>
]>;
+// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the
+// same as the normal Darwin VarArgs handling.
+let Entry = 1 in
+def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // Handle all scalar types as either i32 or f32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[f16], CCPromoteToType<f32>>,
+
+ // Everything is on the stack.
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
+]>;
+
+
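A tiny standalone model of the ILP32 vararg stack layout defined above: everything goes on the stack, pointers are demoted to i32 and share the 4-byte slot size with i32/f32, while 64-bit values keep 8-byte slots (the VarArgStack type below is purely illustrative):

    #include <cstdint>

    // Allocate vararg slots in order, honouring each slot's size and alignment.
    struct VarArgStack {
      uint64_t NextOffset = 0;
      uint64_t allocate(uint64_t Size, uint64_t Align) {
        NextOffset = (NextOffset + Align - 1) & ~(Align - 1);
        uint64_t Off = NextOffset;
        NextOffset += Size;
        return Off;
      }
    };

    int main() {
      VarArgStack S;
      uint64_t P = S.allocate(4, 4); // a pointer, demoted to i32
      uint64_t D = S.allocate(8, 8); // a double
      uint64_t I = S.allocate(4, 4); // an int
      return (P == 0 && D == 8 && I == 16) ? 0 : 1;
    }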
// The WebKit_JS calling convention only passes the first argument (the callee)
// in a register and the remaining arguments on the stack. We allow 32-bit stack slots,
// so that WebKit can write partial values in the stack and define the other
@@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[
CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
]>;
+// The order of the callee-saves in this file is important, because the
+// FrameLowering code will use this order to determine the layout of the
+// callee-save area in the stack frame. As can be observed below, Darwin
+// requires the frame-record (LR, FP) to be at the top of the callee-save
+// area, whereas for other platforms it is at the bottom.
+
// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
// presumably a callee to someone. External functions may not do so, but this
// is currently safe since BL has LR as an implicit-def and what happens after a
@@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[
// It would be better to model its preservation semantics properly (create a
// vreg on entry, use it in RET & tail call generation; make that vreg def if we
// end up saving LR as part of a call frame). Watch this space...
-def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+// Darwin puts the frame-record at the top of the callee-save area.
+def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
X23, X24, X25, X26, X27, X28,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
@@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
// and not (LR,FP) pairs.
-def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
+def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, FP, LR,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
// AArch64 PCS for vector functions (VPCS)
// must (additionally) preserve full Q8-Q23 registers
-def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
+def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
(sequence "Q%u", 8, 23))>;
+// Functions taking SVE arguments or returning an SVE type
+// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
+ (sequence "Z%u", 8, 23),
+ (sequence "P%u", 4, 15))>;
+
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
// 'this' and the pointer return value are both passed in X0 in these cases,
// this can be partially modelled by treating X0 as a callee-saved register;
@@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
def CSR_AArch64_AAPCS_SwiftError
- : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
// The function used by Darwin to obtain the address of a thread-local variable
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
@@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin
// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
def CSR_AArch64_CXX_TLS_Darwin
- : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
(sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
(sequence "D%u", 0, 31))>;
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 9f324b433209..35e6fef24363 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -103,6 +103,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -181,6 +182,7 @@ static bool canDefBePartOfLOH(const MachineInstr &MI) {
case AArch64::ADDXri:
return canAddBePartOfLOH(MI);
case AArch64::LDRXui:
+ case AArch64::LDRWui:
// Check immediate to see if the immediate is an address.
switch (MI.getOperand(2).getType()) {
default:
@@ -312,7 +314,8 @@ static void handleUse(const MachineInstr &MI, const MachineOperand &MO,
Info.Type = MCLOH_AdrpAdd;
Info.IsCandidate = true;
Info.MI0 = &MI;
- } else if (MI.getOpcode() == AArch64::LDRXui &&
+ } else if ((MI.getOpcode() == AArch64::LDRXui ||
+ MI.getOpcode() == AArch64::LDRWui) &&
MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) {
Info.Type = MCLOH_AdrpLdrGot;
Info.IsCandidate = true;
@@ -357,7 +360,9 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
return true;
}
} else {
- assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui");
+ assert((MI.getOpcode() == AArch64::LDRXui ||
+ MI.getOpcode() == AArch64::LDRWui) &&
+ "Expect LDRXui or LDRWui");
assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) &&
"Expected GOT relocation");
if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) {
@@ -474,13 +479,23 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) {
handleClobber(LOHInfos[Idx]);
}
// Handle uses.
+
+ SmallSet<int, 4> UsesSeen;
for (const MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || !MO.readsReg())
continue;
int Idx = mapRegToGPRIndex(MO.getReg());
if (Idx < 0)
continue;
- handleUse(MI, MO, LOHInfos[Idx]);
+
+ // Multiple uses of the same register within a single instruction don't
+ // count as MultiUser or block the optimization. This is especially important on
+ // arm64_32, where any memory operation is likely to be an explicit use of
+ // xN and an implicit use of wN (the base address register).
+ if (!UsesSeen.count(Idx)) {
+ handleUse(MI, MO, LOHInfos[Idx]);
+ UsesSeen.insert(Idx);
+ }
}
}
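The UsesSeen change above only forwards the first use of each GPR index within one instruction to handleUse, so an instruction that names both xN and wN (the same index) is not mistaken for a multi-user. A stripped-down model of that dedup step (handleUsesOfOneInst and the counter are illustrative stand-ins, not the real pass):

    #include <set>
    #include <vector>

    // Stand-in for the real LOH bookkeeping; only the dedup logic matters here.
    static int HandledUses = 0;
    static void handleUse(int /*GPRIndex*/) { ++HandledUses; }

    // Process the register uses of one instruction, forwarding each GPR index to
    // handleUse at most once, mirroring the UsesSeen set added above.
    static void handleUsesOfOneInst(const std::vector<int> &UseIndices) {
      std::set<int> UsesSeen;
      for (int Idx : UseIndices) {
        if (Idx < 0)
          continue; // not a GPR we track
        if (UsesSeen.insert(Idx).second)
          handleUse(Idx); // first use of this index in this instruction
      }
    }

    int main() {
      // e.g. a load on arm64_32 with an explicit use of x8 and an implicit use of
      // w8: both map to index 8 and must be counted once.
      handleUsesOfOneInst({8, 8});
      return HandledUses == 1 ? 0 : 1;
    }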
@@ -512,6 +527,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
switch (Opcode) {
case AArch64::ADDXri:
case AArch64::LDRXui:
+ case AArch64::LDRWui:
if (canDefBePartOfLOH(MI)) {
const MachineOperand &Def = MI.getOperand(0);
const MachineOperand &Op = MI.getOperand(1);
diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td
new file mode 100644
index 000000000000..bb99f2516ecf
--- /dev/null
+++ b/lib/Target/AArch64/AArch64Combine.td
@@ -0,0 +1,18 @@
+//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/GlobalISel/Combine.td"
+
+def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
+ "AArch64GenPreLegalizerCombinerHelper", [all_combines,
+ elide_br_by_inverting_cond]> {
+ let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
+}
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 453132e09669..25e23e4623de 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -78,7 +78,7 @@ void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const {
}
MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) {
- if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!Register::isVirtualRegister(MO.getReg()))
return nullptr;
return MRI->getUniqueVRegDef(MO.getReg());
}
@@ -98,7 +98,7 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI,
}
bool Is64Bit;
unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit);
- unsigned NewDestReg = MI.getOperand(0).getReg();
+ Register NewDestReg = MI.getOperand(0).getReg();
if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg()))
NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 2cfbcc592d6a..43ae9f8ec47f 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -220,7 +220,7 @@ bool SSACCmpConv::trivialTailPHIs() {
// PHI operands come in (VReg, MBB) pairs.
for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
- unsigned Reg = I.getOperand(oi).getReg();
+ Register Reg = I.getOperand(oi).getReg();
if (MBB == Head) {
assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
HeadReg = Reg;
@@ -259,7 +259,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) {
// Writes to the zero register are dead.
if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
return true;
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (!Register::isVirtualRegister(DstReg))
return false;
// A virtual register def without any uses will be marked dead later, and
// eventually replaced by the zero register.
@@ -631,7 +631,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
}
const MCInstrDesc &MCID = TII->get(Opc);
// Create a dummy virtual register for the SUBS def.
- unsigned DestReg =
+ Register DestReg =
MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index a43077cb88ec..bc3808df1dbc 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -145,8 +145,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
continue;
// We should not have any relevant physreg defs that are replaceable by
// zero before register allocation. So we just check for dead vreg defs.
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg) ||
(!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
continue;
assert(!MO.isImplicit() && "Unexpected implicit def!");
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 210c10eb1842..082e17e44d04 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -109,7 +109,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned BitSize) {
MachineInstr &MI = *MBBI;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
uint64_t Imm = MI.getOperand(1).getImm();
if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
@@ -150,7 +150,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
} break;
case AArch64::MOVKWi:
case AArch64::MOVKXi: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
.addReg(DstReg,
@@ -174,14 +174,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
+ Register StatusReg = MI.getOperand(1).getReg();
bool StatusDead = MI.getOperand(1).isDead();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
- unsigned NewReg = MI.getOperand(4).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
+ Register NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -254,16 +254,16 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
DebugLoc DL = MI.getDebugLoc();
MachineOperand &DestLo = MI.getOperand(0);
MachineOperand &DestHi = MI.getOperand(1);
- unsigned StatusReg = MI.getOperand(2).getReg();
+ Register StatusReg = MI.getOperand(2).getReg();
bool StatusDead = MI.getOperand(2).isDead();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(3).getReg();
- unsigned DesiredLoReg = MI.getOperand(4).getReg();
- unsigned DesiredHiReg = MI.getOperand(5).getReg();
- unsigned NewLoReg = MI.getOperand(6).getReg();
- unsigned NewHiReg = MI.getOperand(7).getReg();
+ Register AddrReg = MI.getOperand(3).getReg();
+ Register DesiredLoReg = MI.getOperand(4).getReg();
+ Register DesiredHiReg = MI.getOperand(5).getReg();
+ Register NewLoReg = MI.getOperand(6).getReg();
+ Register NewHiReg = MI.getOperand(7).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -475,7 +475,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::LOADgot: {
MachineFunction *MF = MBB.getParent();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
const MachineOperand &MO1 = MI.getOperand(1);
unsigned Flags = MO1.getTargetFlags();
@@ -495,12 +495,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
} else {
// Small codemodel expand into ADRP + LDR.
+ MachineFunction &MF = *MI.getParent()->getParent();
+ DebugLoc DL = MI.getDebugLoc();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
- MachineInstrBuilder MIB2 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
- .add(MI.getOperand(0))
- .addReg(DstReg);
+
+ MachineInstrBuilder MIB2;
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetILP32()) {
+ auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32);
+ unsigned DstFlags = MI.getOperand(0).getTargetFlags();
+ MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui))
+ .addDef(Reg32)
+ .addReg(DstReg, RegState::Kill)
+ .addReg(DstReg, DstFlags | RegState::Implicit);
+ } else {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui))
+ .add(MI.getOperand(0))
+ .addUse(DstReg, RegState::Kill);
+ }
if (MO1.isGlobal()) {
MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
@@ -534,11 +548,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::MOVaddrTLS:
case AArch64::MOVaddrEXT: {
// Expand into ADRP + ADD.
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
.add(MI.getOperand(1));
+ if (MI.getOperand(1).getTargetFlags() & AArch64II::MO_TAGGED) {
+ // MO_TAGGED on the page indicates a tagged address. Set the tag now.
+ // We do so by creating a MOVK that sets bits 48-63 of the register to
+ // (global address + 0x100000000 - PC) >> 48. This assumes that we're in
+ // the small code model so we can assume a binary size of <= 4GB, which
+ // makes the untagged PC-relative offset positive. The binary must also be
+ // loaded into the address range [0, 2^48). Both of these properties need to
+ // be ensured at runtime when using tagged addresses.
+ auto Tag = MI.getOperand(1);
+ Tag.setTargetFlags(AArch64II::MO_PREL | AArch64II::MO_G3);
+ Tag.setOffset(0x100000000);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi), DstReg)
+ .addReg(DstReg)
+ .add(Tag)
+ .addImm(48);
+ }
+
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
.add(MI.getOperand(0))
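Numerically, the MOVK added for MO_TAGGED behaves as described in the comment above; the sketch below only models the resulting register value (S is the global's link-time address, P the PC of the relocated instruction; the relocation and loader machinery is outside this model):

    #include <cstdint>

    // The ADRP+ADD pair materialises the untagged address, which by assumption
    // fits in the low 48 bits; the MOVK (..., lsl #48) then overwrites bits
    // [63:48] with ((S + 0x100000000 - P) >> 48), per the MO_PREL | MO_G3
    // operand built above.
    static uint64_t taggedAddress(uint64_t S, uint64_t P) {
      uint64_t Untagged = S & ((1ULL << 48) - 1);      // binary assumed below 2^48
      uint64_t Tag16 = (S + 0x100000000ULL - P) >> 48; // positive: binary <= 4GB
      return (Tag16 << 48) | Untagged;
    }

    int main() {
      // The low 48 bits are always the untagged address; only the top 16 bits
      // carry the tag.
      uint64_t V = taggedAddress(0x1000, 0x2000);
      return (V & ((1ULL << 48) - 1)) == 0x1000 ? 0 : 1;
    }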
@@ -561,7 +592,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return true;
case AArch64::MOVbaseTLS: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
MachineFunction *MF = MBB.getParent();
if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
@@ -642,11 +673,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// instruction sequence.
int BaseOffset = -AFI->getTaggedBasePointerOffset();
unsigned FrameReg;
- int FrameRegOffset = TFI->resolveFrameOffsetReference(
- MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false,
+ StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
+ MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
+ /*PreferFP=*/false,
/*ForSimm=*/true);
Register SrcReg = FrameReg;
- if (FrameRegOffset != 0) {
+ if (FrameRegOffset) {
// Use output register as temporary.
SrcReg = MI.getOperand(0).getReg();
emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg,
diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 3b3182128c4c..b54fc2e51bac 100644
--- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -642,7 +642,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
}
// Loads from the stack pointer don't get prefetched.
- unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg();
+ Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
return None;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 8dc2768b9597..277a3052f1e5 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
- unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+ unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
if (!DestEVT.isSimple())
@@ -474,12 +474,32 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
ADRPReg)
.addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);
- ResultReg = createResultReg(&AArch64::GPR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+ unsigned LdrOpc;
+ if (Subtarget->isTargetILP32()) {
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ LdrOpc = AArch64::LDRWui;
+ } else {
+ ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ LdrOpc = AArch64::LDRXui;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC | OpFlags);
+ if (!Subtarget->isTargetILP32())
+ return ResultReg;
+
+ // LDRWui produces a 32-bit register, but in-register pointers are 64 bits
+ // wide, so we must extend the result on ILP32.
+ unsigned Result64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(Result64)
+ .addImm(0)
+ .addReg(ResultReg, RegState::Kill)
+ .addImm(AArch64::sub_32);
+ return Result64;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
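For the ILP32 GOT path added to materializeGV above, the net effect is a zero-extension of the 32-bit GOT entry into a 64-bit register (the 32-bit load already clears the upper half of the X register; SUBREG_TO_REG records that for the register allocator). A trivial value-level model:

    #include <cstdint>

    // On arm64_32 the GOT slot holds a 32-bit pointer; it is loaded with LDRWui
    // and then carried in an X register with the upper 32 bits known to be zero.
    static uint64_t widenILP32Pointer(uint32_t GotValue) {
      return static_cast<uint64_t>(GotValue); // zero-extend w -> x
    }

    int main() {
      return widenILP32Pointer(0x80001000u) == 0x0000000080001000ull ? 0 : 1;
    }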
@@ -504,6 +524,15 @@ unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
if (!CEVT.isSimple())
return 0;
MVT VT = CEVT.getSimpleVT();
+ // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that,
+ // 'null' pointers need somewhat special treatment.
+ if (const auto *CPN = dyn_cast<ConstantPointerNull>(C)) {
+ (void)CPN;
+ assert(CPN->getType()->getPointerAddressSpace() == 0 &&
+ "Unexpected address space");
+ assert(VT == MVT::i64 && "Expected 64-bit pointers");
+ return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT);
+ }
if (const auto *CI = dyn_cast<ConstantInt>(C))
return materializeInt(CI, VT);
@@ -946,6 +975,9 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT evt = TLI.getValueType(DL, Ty, true);
+ if (Subtarget->isTargetILP32() && Ty->isPointerTy())
+ return false;
+
// Only handle simple types.
if (evt == MVT::Other || !evt.isSimple())
return false;
@@ -988,6 +1020,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const {
}
bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+ if (Subtarget->isTargetILP32())
+ return false;
+
unsigned ScaleFactor = getImplicitScaleFactor(VT);
if (!ScaleFactor)
return false;
@@ -3165,6 +3200,11 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (IsTailCall)
return false;
+ // FIXME: we could and should support this, but for now correctness at -O0 is
+ // more important.
+ if (Subtarget->isTargetILP32())
+ return false;
+
CodeModel::Model CM = TM.getCodeModel();
// Only support the small-addressing and large code models.
if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
@@ -3434,8 +3474,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
MFI.setFrameAddressIsTaken(true);
const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
- unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ Register SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
// Recursively load frame address
@@ -3796,6 +3836,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ // FIXME: in principle it could. Mostly just a case of zero extending outgoing
+ // pointers.
+ if (Subtarget->isTargetILP32())
+ return false;
+
if (F.isVarArg())
return false;
@@ -3842,7 +3887,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
return false;
unsigned SrcReg = Reg + VA.getValNo();
- unsigned DestReg = VA.getLocReg();
+ Register DestReg = VA.getLocReg();
// Avoid a cross-class copy. This is very unlikely.
if (!MRI.getRegClass(SrcReg)->contains(DestReg))
return false;
@@ -3970,7 +4015,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
if (DestVT == MVT::i64) {
// We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the
// upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
- unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Reg64)
.addImm(0)
@@ -4123,7 +4168,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4244,7 +4289,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4353,7 +4398,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4412,7 +4457,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
if (DestVT == MVT::i8 || DestVT == MVT::i16)
DestVT = MVT::i32;
else if (DestVT == MVT::i64) {
- unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Src64)
.addImm(0)
@@ -4495,7 +4540,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
const auto *LoadMI = MI;
if (LoadMI->getOpcode() == TargetOpcode::COPY &&
LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
- unsigned LoadReg = MI->getOperand(1).getReg();
+ Register LoadReg = MI->getOperand(1).getReg();
LoadMI = MRI.getUniqueVRegDef(LoadReg);
assert(LoadMI && "Expected valid instruction");
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8c6e5cbd5c13..68e1e6a30224 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -44,11 +44,19 @@
// | |
// |-----------------------------------|
// | |
-// | prev_fp, prev_lr |
+// | callee-saved gpr registers | <--.
+// | | | On Darwin platforms these
+// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
+// | | | (frame record first)
+// | prev_fp, prev_lr | <--'
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | |
-// | other callee-saved registers |
+// | callee-saved fp/simd/SVE regs |
+// | |
+// |-----------------------------------|
+// | |
+// | SVE stack objects |
// | |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
@@ -80,6 +88,20 @@
// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
//
+// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
+// callee-saved area, since the unwind encoding does not allow for encoding
+// this dynamically and existing tools depend on this layout. For other
+// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
+// area to allow SVE stack objects (allocated directly below the callee-saves,
+// if available) to be accessed directly from the frame pointer.
+// The SVE spill/fill instructions have VL-scaled addressing modes such
+// as:
+// ldr z8, [fp, #-7 mul vl]
+// For SVE the size of the vector length (VL) is not known at compile-time, so
+// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
+// layout, we don't need to add an unscaled offset to the framepointer before
+// accessing the SVE object in the frame.
+//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
@@ -94,6 +116,7 @@
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
if (!MO.isFI())
continue;
- int Offset = 0;
+ StackOffset Offset;
if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
AArch64FrameOffsetCannotUpdate)
return 0;
@@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
return DefaultSafeSPDisplacement;
}
+/// Returns the size of the entire SVE stackframe (calleesaves + spills).
+static StackOffset getSVEStackSize(const MachineFunction &MF) {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
+}
+
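getSVEStackSize returns a StackOffset whose SVE part is expressed in scalable units that only resolve to bytes once the runtime vector length is known. The struct below is a simplified standalone model of that idea (the real class lives in the new AArch64StackOffset.h and is richer than this sketch):

    #include <cstdint>

    // A frame offset with a fixed byte component and a scalable component. The
    // scalable part is multiplied by vscale (runtime vector length / 128 bits)
    // before it becomes a concrete byte offset, which is why SVE allocations are
    // emitted as separate SP adjustments in the prologue/epilogue above.
    struct StackOffsetModel {
      int64_t Bytes;         // e.g. {-NumBytes, MVT::i8}
      int64_t ScalableBytes; // e.g. {StackSizeSVE, MVT::nxv1i8}

      StackOffsetModel operator+(const StackOffsetModel &O) const {
        return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
      }
      StackOffsetModel operator-() const { return {-Bytes, -ScalableBytes}; }
      explicit operator bool() const { return Bytes != 0 || ScalableBytes != 0; }

      // Concrete byte offset for a given vscale.
      int64_t resolve(int64_t VScale) const { return Bytes + ScalableBytes * VScale; }
    };

    int main() {
      StackOffsetModel Locals{-64, 0}, SVEArea{0, -32};
      StackOffsetModel Total = Locals + SVEArea;
      // With 256-bit vectors (vscale == 2) the SVE area is 64 bytes.
      return (Total && Total.resolve(2) == -128) ? 0 : 1;
    }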
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
@@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned NumBytes = AFI->getLocalStackSize();
- return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+ return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+ getSVEStackSize(MF));
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
+ TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
- TII);
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+ {-(int64_t)CalleePopAmount, MVT::i8}, TII);
}
return MBB.erase(I);
}
@@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (MF.getFunction().hasOptSize())
+ return false;
+
if (AFI->getLocalStackSize() == 0)
return false;
@@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
if (canUseRedZone(MF))
return false;
+ // When there is an SVE area on the stack, always allocate the
+ // callee-saves and spills/locals separately.
+ if (getSVEStackSize(MF))
+ return false;
+
return true;
}
@@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STPXpre: {
- unsigned Reg0 = MBBI->getOperand(1).getReg();
- unsigned Reg1 = MBBI->getOperand(2).getReg();
+ Register Reg0 = MBBI->getOperand(1).getReg();
+ Register Reg1 = MBBI->getOperand(2).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
.addImm(Imm * 8)
@@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
}
case AArch64::STPXi:
case AArch64::LDPXi: {
- unsigned Reg0 = MBBI->getOperand(0).getReg();
- unsigned Reg1 = MBBI->getOperand(1).getReg();
+ Register Reg0 = MBBI->getOperand(0).getReg();
+ Register Reg1 = MBBI->getOperand(1).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
.addImm(Imm * 8)
@@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) {
F.needsUnwindTableEntry();
}
+static bool isTargetDarwin(const MachineFunction &MF) {
+ return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Ideally it should match SP value after prologue.
AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// getStackSize() includes all the locals in its size calculation. We don't
// include these locals when computing the stack size of a funclet, as they
// are allocated in the parent's stack frame and accessed via the frame
@@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
: (int)MFI.getStackSize();
if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
+ assert(!SVEStackSize &&
+ "unexpected function without stack frame but with SVE objects");
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
if (!NumBytes)
@@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setHasRedZone(true);
++NumRedZoneFunctions;
} else {
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
+ false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
@@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
+ NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
@@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (HasFP) {
- // Only set up FP if we actually need to. Frame pointer is fp =
- // sp - fixedobject - 16.
- int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+ // Only set up FP if we actually need to.
+ int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
+
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
- emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+ {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
+ NeedsWinCFI, &HasWinCFI);
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
@@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
NumBytes = 0;
}
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+ MachineInstr::FrameSetup);
+
// Allocate space for the rest of the frame.
if (NumBytes) {
const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
+ false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned Alignment = MFI.getMaxAlignment();
@@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
- const int StackGrowth = -TD.getPointerSize(0);
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ const int StackGrowth = isTargetDarwin(MF)
+ ? (2 * -TD.getPointerSize(0))
+ : -AFI->getCalleeSavedStackSize();
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
// .globl __foo
@@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, Reg, 2 * StackGrowth - FixedObject));
+ nullptr, Reg, StackGrowth - FixedObject));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameDestroy);
}
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
+ assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI);
+ {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
@@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+ // Deallocate the SVE area.
+ if (SVEStackSize)
+ if (!AFI->isStackRealigned())
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
+ TII, MachineInstr::FrameDestroy);
+
if (!hasFP(MF)) {
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
@@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackRestoreBytes, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
+ {StackRestoreBytes, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (Done) {
if (NeedsWinCFI) {
HasWinCFI = true;
@@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
- if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned()))
+ if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
+ int64_t OffsetToFrameRecord =
+ isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -AFI->getCalleeSavedStackSize() + 16, TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI);
- else if (NumBytes)
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ {OffsetToFrameRecord, MVT::i8},
+ TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ } else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI);
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
@@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
- AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
+ {(int64_t)AfterCSRPopSize, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
}
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
@@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
return resolveFrameIndexReference(
- MF, FI, FrameReg,
- /*PreferFP=*/
- MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
- /*ForSimm=*/false);
+ MF, FI, FrameReg,
+ /*PreferFP=*/
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+ /*ForSimm=*/false)
+ .getBytes();
}
int AArch64FrameLowering::getNonLocalFrameIndexReference(
@@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference(
return getSEHFrameIndexOffset(MF, FI);
}
-static int getFPOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
- return ObjectOffset + FixedObject + 16;
+ unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize();
+ return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
}
-static int getStackOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
- return ObjectOffset + MFI.getStackSize();
+ return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8};
}
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
@@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
MF.getSubtarget().getRegisterInfo());
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
- ? getFPOffset(MF, ObjectOffset)
- : getStackOffset(MF, ObjectOffset);
+ ? getFPOffset(MF, ObjectOffset).getBytes()
+ : getStackOffset(MF, ObjectOffset).getBytes();
}
-int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- bool PreferFP,
- bool ForSimm) const {
+StackOffset AArch64FrameLowering::resolveFrameIndexReference(
+ const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
+ bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
- return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg,
+ bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
+ return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
PreferFP, ForSimm);
}
-int AArch64FrameLowering::resolveFrameOffsetReference(
- const MachineFunction &MF, int ObjectOffset, bool isFixed,
+StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
+ const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE,
unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
@@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- int FPOffset = getFPOffset(MF, ObjectOffset);
- int Offset = getStackOffset(MF, ObjectOffset);
+ int FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
+ int Offset = getStackOffset(MF, ObjectOffset).getBytes();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
// reliable as a base). Make sure useFPForScavengingIndex() does the
// right thing for the emergency spill slot.
bool UseFP = false;
- if (AFI->hasStackFrame()) {
+ if (AFI->hasStackFrame() && !isSVE) {
+ // We shouldn't prefer using the FP when there is an SVE area
+ // in between the FP and the non-SVE locals/spills.
+ PreferFP &= !SVEStackSize;
+
// Note: Keeping the following as multiple 'if' statements rather than
// merging to a single expression for readability.
//
@@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
- else if (!CanUseBP) // Can't use BP. Forced to use FP.
+ else if (!CanUseBP) { // Can't use BP. Forced to use FP.
+ assert(!SVEStackSize && "Expected BP to be available");
UseFP = true;
+ }
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
@@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
"In the presence of dynamic stack pointer realignment, "
"non-argument/CSR objects cannot be accessed through the frame pointer");
+ if (isSVE) {
+ int64_t OffsetToSVEArea =
+ MFI.getStackSize() - AFI->getCalleeSavedStackSize();
+ StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
+ StackOffset SPOffset = SVEStackSize +
+ StackOffset(ObjectOffset, MVT::nxv1i8) +
+ StackOffset(OffsetToSVEArea, MVT::i8);
+ // Always use the FP for SVE spills if available and beneficial.
+ if (hasFP(MF) &&
+ (SPOffset.getBytes() ||
+ FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
+ RegInfo->needsStackRealignment(MF))) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+
+ FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
+ : (unsigned)AArch64::SP;
+ return SPOffset;
+ }
+
+ StackOffset ScalableOffset = {};
+ if (UseFP && !(isFixed || isCSR))
+ ScalableOffset = -SVEStackSize;
+ if (!UseFP && (isFixed || isCSR))
+ ScalableOffset = SVEStackSize;
+
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
- return FPOffset;
+ return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
}
// Use the base pointer if we have one.
@@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
Offset -= AFI->getLocalStackSize();
}
- return Offset;
+ return StackOffset(Offset, MVT::i8) + ScalableOffset;
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
@@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
return true;
}
+/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
+/// WindowsCFI requires that only consecutive registers can be paired.
+/// LR and FP need to be allocated together when the frame needs to save
+/// the frame-record. This means any other register pairing with LR is invalid.
+static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
+ bool NeedsWinCFI, bool NeedsFrameRecord) {
+ if (NeedsWinCFI)
+ return invalidateWindowsRegisterPairing(Reg1, Reg2, true);
+
+ // If we need to store the frame record, don't pair any register
+ // with LR other than FP.
+ if (NeedsFrameRecord)
+ return Reg2 == AArch64::LR;
+
+ return false;
+}
+
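// Standalone sketch of the pairing rule introduced above: when a frame record
// must be saved, the only register allowed to share an stp/ldp with LR is FP,
// and under Windows CFI only consecutive registers may pair. Register numbers
// are modelled with plain integers (29 for FP, 30 for LR); this illustrates
// the rule, it is not the in-tree helper.
#include <cassert>

namespace pairing_sketch {
constexpr unsigned FP = 29, LR = 30;

// Mirrors the consecutive-only restriction used for Windows unwind codes.
bool invalidWindowsPairing(unsigned Reg1, unsigned Reg2) {
  return Reg2 != Reg1 + 1;
}

bool invalidPairing(unsigned Reg1, unsigned Reg2, bool NeedsWinCFI,
                    bool NeedsFrameRecord) {
  if (NeedsWinCFI)
    return invalidWindowsPairing(Reg1, Reg2);
  // With a frame record, LR may only appear next to FP, so reject any pair
  // whose second member is LR.
  if (NeedsFrameRecord)
    return Reg2 == LR;
  return false;
}
} // namespace pairing_sketch

int main() {
  using namespace pairing_sketch;
  assert(!invalidPairing(LR, FP, /*WinCFI=*/false, /*FrameRecord=*/true));
  assert(invalidPairing(19, LR, false, true));  // x19+lr would split the record
  assert(invalidPairing(19, 21, true, false));  // non-consecutive under WinCFI
  return 0;
}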
namespace {
struct RegPairInfo {
@@ -1701,7 +1824,7 @@ struct RegPairInfo {
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
- bool &NeedShadowCallStackProlog) {
+ bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
if (CSI.empty())
return;
@@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs(
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
- !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
+ !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
+ NeedsFrameRecord))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR64:
@@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs(
(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
+ assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
+ RPI.Reg1 == AArch64::LR) &&
+ "FrameRecord must be allocated together with LR");
+
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
@@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog);
+ NeedShadowCallStackProlog, hasFP(MF));
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (NeedShadowCallStackProlog) {
@@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog);
+ NeedShadowCallStackProlog, hasFP(MF));
auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
@@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(AArch64::LR);
}
- LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
for (unsigned Reg
: SavedRegs.set_bits()) dbgs()
<< ' ' << printReg(Reg, RegInfo);
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
- bool CanEliminateFrame = SavedRegs.count() == 0;
+ unsigned MaxAlign = getStackAlignment();
+ int64_t SVEStackSize =
+ alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
+ assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+ bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
- bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
+
+ // Conservatively always assume BigStack when there are SVE spills.
+ bool BigStack = SVEStackSize ||
+ (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// store the pair.
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
- ExtraCSSpill = UnspilledCSGPRPaired;
+ ExtraCSSpill = UnspilledCSGPR;
}
// If we didn't find an extra callee-saved register to spill, create
@@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
return AFI->hasCalleeSaveStackFreeSpace();
}
+int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
+ unsigned &MaxAlign) const {
+ // Process all fixed stack objects.
+ int64_t Offset = 0;
+ for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+ if (MFI.getStackID(I) == TargetStackID::SVEVector) {
+ int64_t FixedOffset = -MFI.getObjectOffset(I);
+ if (FixedOffset > Offset)
+ Offset = FixedOffset;
+ }
+
+ // Note: We don't take allocatable stack objects into
+ // account yet, because allocation for those is not yet
+ // implemented.
+ return Offset;
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+ "Upwards growing stack unsupported");
+
+ unsigned MaxAlign = getStackAlignment();
+ int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
+ assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
if (!MF.hasEHFunclets())
return;
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineFrameInfo &MFI = MF.getFrameInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
MachineBasicBlock &MBB = MF.front();
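// Minimal sketch of how the SVE stack-area size is derived in
// determineSVEStackSize above: each fixed SVE object carries a negative offset
// from the top of the SVE region, so the region must cover the most negative
// one, rounded up to the 16-byte stack alignment. The struct stands in for
// MachineFrameInfo objects and the units are "scalable bytes"; illustrative
// only, not the in-tree routine.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct SVEObject {
  int64_t Offset;  // object offset, negative for fixed SVE objects
  bool IsSVE;      // true if the object lives in the SVE stack area
};

int64_t determineSVEStackSizeSketch(const std::vector<SVEObject> &Objects,
                                    int64_t Alignment = 16) {
  int64_t Size = 0;
  for (const SVEObject &O : Objects)
    if (O.IsSVE)
      Size = std::max(Size, -O.Offset);  // cover the deepest SVE object
  // alignTo(Size, Alignment) for a power-of-two alignment.
  return (Size + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  // Two SVE objects at -16 and -48 scalable bytes, plus one ordinary slot.
  std::vector<SVEObject> Objects = {{-16, true}, {-48, true}, {-8, false}};
  std::cout << determineSVEStackSizeSketch(Objects) << "\n";  // prints 48
  return 0;
}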
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 6dbd34b2189f..ac150e86c9eb 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#include "AArch64StackOffset.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -20,7 +21,7 @@ namespace llvm {
class AArch64FrameLowering : public TargetFrameLowering {
public:
explicit AArch64FrameLowering()
- : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+ : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
@@ -39,12 +40,13 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- int resolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, bool PreferFP,
- bool ForSimm) const;
- int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset,
- bool isFixed, unsigned &FrameReg,
- bool PreferFP, bool ForSimm) const;
+ StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg, bool PreferFP,
+ bool ForSimm) const;
+ StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
+ int ObjectOffset, bool isFixed,
+ bool isSVE, unsigned &FrameReg,
+ bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -85,9 +87,21 @@ public:
int FI) const override;
int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ bool isSupportedStackID(TargetStackID::Value ID) const override {
+ switch (ID) {
+ default:
+ return false;
+ case TargetStackID::Default:
+ case TargetStackID::SVEVector:
+ case TargetStackID::NoAlloc:
+ return true;
+ }
+ }
+
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
unsigned StackBumpBytes) const;
+ int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const;
};
} // End llvm namespace
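// The frame-lowering changes above replace plain byte offsets with a
// StackOffset that carries a fixed part and a scalable (SVE) part. The class
// below is a stripped-down stand-in that mirrors only the operations the diff
// uses (construction from an i8 or nxv1i8 quantity, addition, negation,
// getBytes()/getScalableBytes(), and boolean tests); it is not the in-tree
// AArch64StackOffset.
#include <cassert>
#include <cstdint>

enum class OffsetKind { Bytes, ScalableBytes };  // stand-in for MVT::i8 / MVT::nxv1i8

class StackOffsetSketch {
  int64_t Fixed = 0;     // ordinary bytes
  int64_t Scalable = 0;  // multiples of the unknown SVE vector-length granule
public:
  StackOffsetSketch() = default;
  StackOffsetSketch(int64_t Amount, OffsetKind Kind) {
    (Kind == OffsetKind::Bytes ? Fixed : Scalable) = Amount;
  }
  StackOffsetSketch operator+(const StackOffsetSketch &O) const {
    StackOffsetSketch R;
    R.Fixed = Fixed + O.Fixed;
    R.Scalable = Scalable + O.Scalable;
    return R;
  }
  StackOffsetSketch operator-() const {
    StackOffsetSketch R;
    R.Fixed = -Fixed;
    R.Scalable = -Scalable;
    return R;
  }
  int64_t getBytes() const { return Fixed; }
  int64_t getScalableBytes() const { return Scalable; }
  explicit operator bool() const { return Fixed != 0 || Scalable != 0; }
};

int main() {
  // e.g. resolveFrameOffsetReference: a fixed FP offset plus a scalable
  // adjustment for the SVE area sitting between FP and the non-SVE locals.
  StackOffsetSketch FPOffset(32, OffsetKind::Bytes);
  StackOffsetSketch SVEArea(-2, OffsetKind::ScalableBytes);
  StackOffsetSketch Total = FPOffset + SVEArea;
  assert(Total.getBytes() == 32 && Total.getScalableBytes() == -2);
  assert(static_cast<bool>(-Total));
  return 0;
}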
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index cd7e927ac80c..1f08505f37e7 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2053,7 +2053,7 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
}
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
- if (Depth >= 6)
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
return;
// Initialize UsefulBits
if (!Depth) {
@@ -2913,49 +2913,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
break;
- case ISD::EXTRACT_VECTOR_ELT: {
- // Extracting lane zero is a special case where we can just use a plain
- // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
- // the rest of the compiler, especially the register allocator and copyi
- // propagation, to reason about, so is preferred when it's possible to
- // use it.
- ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
- // Bail and use the default Select() for non-zero lanes.
- if (LaneNode->getZExtValue() != 0)
- break;
- // If the element type is not the same as the result type, likewise
- // bail and use the default Select(), as there's more to do than just
- // a cross-class COPY. This catches extracts of i8 and i16 elements
- // since they will need an explicit zext.
- if (VT != Node->getOperand(0).getValueType().getVectorElementType())
- break;
- unsigned SubReg;
- switch (Node->getOperand(0)
- .getValueType()
- .getVectorElementType()
- .getSizeInBits()) {
- default:
- llvm_unreachable("Unexpected vector element type!");
- case 64:
- SubReg = AArch64::dsub;
- break;
- case 32:
- SubReg = AArch64::ssub;
- break;
- case 16:
- SubReg = AArch64::hsub;
- break;
- case 8:
- llvm_unreachable("unexpected zext-requiring extract element!");
- }
- SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
- Node->getOperand(0));
- LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
- LLVM_DEBUG(Extract->dumpr(CurDAG));
- LLVM_DEBUG(dbgs() << "\n");
- ReplaceNode(Node, Extract.getNode());
- return;
- }
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7becc99fb5c7..2746117e8ee5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
}
+ if (Subtarget->hasSVE()) {
+ // Add legal sve predicate types
+ addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
+
+ // Add legal sve data types
+ addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
+
+ addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+ }
+
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
@@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
@@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
+ MaxLoadsPerMemcmpOptSize = 4;
+ MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
+ ? MaxLoadsPerMemcmpOptSize : 8;
+
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
@@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
EnableExtLdPromotion = true;
// Set required alignment.
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(Align(4));
// Set preferred alignments.
- setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
- setPrefLoopAlignment(STI.getPrefLoopAlignment());
+ setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
+ setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
// Only change the limit for entries in a jump table if specified by
// the sub target, but not at the command line.
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
@@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
@@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
+ if (Subtarget->hasSVE()) {
+ for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+ if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1)
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ }
+ }
+
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
@@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known.One &= Known2.One;
break;
}
+ case AArch64ISD::LOADgot:
+ case AArch64ISD::ADDlow: {
+ if (!Subtarget->isTargetILP32())
+ break;
+ // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
+ Known.Zero = APInt::getHighBitsSet(64, 32);
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
@@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
return true;
}
+// Same as above but handling LLTs instead.
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
+ LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
+ if (Subtarget->requiresStrictAlign())
+ return false;
+
+ if (Fast) {
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() ||
+ Ty.getSizeInBytes() != 16 ||
+ // See comments in performSTORECombine() for more details about
+ // these conditions.
+
+ // Code that uses clang vector extensions can mark that it
+ // wants unaligned accesses to be treated as fast by
+ // underspecifying alignment to be 1 or 2.
+ Align <= 2 ||
+
+ // Disregard v2i64. Memcpy lowering produces those and splitting
+ // them regresses performance on micro-benchmarks and olden/bh.
+ Ty == LLT::vector(2, 64);
+ }
+ return true;
+}
+
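// Sketch of the fast-path heuristic added above, reduced to plain values: an
// unaligned access is reported as fast unless the CPU is slow on misaligned
// 128-bit stores and the access is a 16-byte one whose stated alignment is
// above 2 and which is not the v2i64 shape produced by memcpy lowering. The
// booleans stand in for the subtarget/LLT queries; illustrative only.
#include <iostream>

bool unalignedAccessIsFast(unsigned SizeInBytes, unsigned Align,
                           bool Misaligned128StoreSlow, bool IsV2i64) {
  return !Misaligned128StoreSlow || SizeInBytes != 16 || Align <= 2 || IsV2i64;
}

int main() {
  // 16-byte store, alignment 8, on a core with slow misaligned 128-bit stores:
  std::cout << unalignedAccessIsFast(16, 8, true, false) << "\n";  // 0 (slow)
  // The same store expressed as v2i64 (memcpy lowering) stays on the fast path:
  std::cout << unalignedAccessIsFast(16, 8, true, true) << "\n";   // 1
  return 0;
}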
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
@@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::STZG: return "AArch64ISD::STZG";
case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
+ case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
+ case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
+ case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
+ case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
}
return nullptr;
}
@@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned IfTrueReg = MI.getOperand(1).getReg();
- unsigned IfFalseReg = MI.getOperand(2).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register IfTrueReg = MI.getOperand(1).getReg();
+ Register IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
@@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
}
// Returns true if the given Op is the overflow flag result of an overflow
@@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
SDLoc(Op)).first;
}
@@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_sunpkhi:
+ return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_sunpklo:
+ return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uunpkhi:
+ return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uunpklo:
+ return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
+ Op.getOperand(1));
+
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
@@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SPLAT_VECTOR:
+ return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::SRA:
@@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
- return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
- case CallingConv::Win64:
+ if (!IsVarArg)
+ return CC_AArch64_DarwinPCS;
+ return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
+ : CC_AArch64_DarwinPCS_VarArg;
+ case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
case CallingConv::AArch64_VectorCall:
return CC_AArch64_AAPCS;
@@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
+ DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
@@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
continue;
}
+ SDValue ArgValue;
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
-
- SDValue ArgValue;
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
@@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
+ else if (RegVT.isScalableVector() &&
+ RegVT.getVectorElementType() == MVT::i1)
+ RC = &AArch64::PPRRegClass;
+ else if (RegVT.isScalableVector())
+ RC = &AArch64::ZPRRegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
@@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
- // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
- // nodes after our lowering.
- assert(RegVT == Ins[i].VT && "incorrect register location selected");
+ break;
+ case CCValAssign::AExtUpper:
+ ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
+ DAG.getConstant(32, DL, RegVT));
+ ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
break;
}
-
- InVals.push_back(ArgValue);
-
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
@@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
@@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
switch (VA.getLocInfo()) {
default:
break;
+ case CCValAssign::Trunc:
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
- InVals.push_back(ArgValue);
}
+ if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+ ArgValue, DAG.getValueType(MVT::i32));
+ InVals.push_back(ArgValue);
}
// varargs
@@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
- // We currently pass all varargs at 8-byte alignment.
- StackOffset = ((StackOffset + 7) & ~7);
+ // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
+ StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
@@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert(!FuncInfo->getSRetReturnReg());
MVT PtrTy = getPointerTy(DAG.getDataLayout());
- unsigned Reg =
- MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ Register Reg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
@@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
+ DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
@@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult(
continue;
}
- SDValue Val =
- DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
- Chain = Val.getValue(1);
- InFlag = Val.getValue(2);
+ // Avoid copying a physreg twice since RegAllocFast is incompetent and only
+ // allows one use of a physreg per block.
+ SDValue Val = CopiedRegs.lookup(VA.getLocReg());
+ if (!Val) {
+ Val =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ CopiedRegs[VA.getLocReg()] = Val;
+ }
switch (VA.getLocInfo()) {
default:
@@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult(
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
+ case CCValAssign::AExtUpper:
+ Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ LLVM_FALLTHROUGH;
+ case CCValAssign::AExt:
+ LLVM_FALLTHROUGH;
+ case CCValAssign::ZExt:
+ Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
+ break;
}
InVals.push_back(Val);
@@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction::CallSiteInfo CSInfo;
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallSet<unsigned, 8> RegsUsed;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
- RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ RegsToPass.emplace_back(F.PReg, Val);
}
}
@@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::AExtUpper:
+ assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ Arg = DAG.getBitcast(VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::Trunc:
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
}
if (VA.isRegLoc()) {
@@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
"unexpected use of 'returned'");
IsThisReturn = true;
}
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (RegsUsed.count(VA.getLocReg())) {
+ // If this register has already been used then we're trying to pack
+ // parts of an [N x i32] into an X-register. The extension type will
+ // take care of putting the two halves in the right place but we have to
+ // combine them.
+ SDValue &Bits =
+ std::find_if(RegsToPass.begin(), RegsToPass.end(),
+ [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })
+ ->second;
+ Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
+ // Call site info is used for function's parameter entry value
+ // tracking. For now we track only simple cases when parameter
+ // is transferred through whole register.
+ CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
+ [&VA](MachineFunction::ArgRegPair ArgReg) {
+ return ArgReg.Reg == VA.getLocReg();
+ }),
+ CSInfo.end());
+ } else {
+ RegsToPass.emplace_back(VA.getLocReg(), Arg);
+ RegsUsed.insert(VA.getLocReg());
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), i);
+ }
} else {
assert(VA.isMemLoc());
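// Sketch of the [2 x i32]-in-one-X-register packing handled above: the lower
// half is any-extended in place, the upper half (CCValAssign::AExtUpper) is
// extended and shifted left by 32, and when the same physical register is
// assigned twice the two contributions are OR-ed together. Plain integers
// model the SDValues; illustrative only.
#include <cassert>
#include <cstdint>

uint64_t packLowerHalf(uint32_t Lo) { return static_cast<uint64_t>(Lo); }

uint64_t packUpperHalf(uint32_t Hi) {
  return static_cast<uint64_t>(Hi) << 32;  // AExtUpper: extend, then SHL by 32
}

int main() {
  uint32_t Lo = 0x11223344u, Hi = 0xAABBCCDDu;
  // A second use of the register ORs the new part into the existing bits.
  uint64_t XReg = packLowerHalf(Lo);
  XReg |= packUpperHalf(Hi);
  assert(XReg == 0xAABBCCDD11223344ull);
  // Unpacking on the receiving side (AExtUpper in LowerFormalArguments above):
  // shift right by 32 and truncate back to 32 bits.
  assert(static_cast<uint32_t>(XReg >> 32) == Hi);
  return 0;
}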
@@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
+ // Check callee args/returns for SVE registers and set calling convention
+ // accordingly.
+ if (CallConv == CallingConv::C) {
+ bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
+ return Out.VT.isScalableVector();
+ });
+ bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
+ return In.VT.isScalableVector();
+ });
+
+ if (CalleeInSVE || CalleeOutSVE)
+ CallConv = CallingConv::AArch64_SVE_VectorCall;
+ }
+
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
@@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Copy the result values into the output registers.
SDValue Flag;
- SmallVector<SDValue, 4> RetOps(1, Chain);
+ SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
+ SmallSet<unsigned, 4> RegsUsed;
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
@@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::AExt:
+ case CCValAssign::ZExt:
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
+ break;
+ case CCValAssign::AExtUpper:
+ assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
+ Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ break;
}
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ if (RegsUsed.count(VA.getLocReg())) {
+ SDValue &Bits =
+ std::find_if(RetVals.begin(), RetVals.end(),
+ [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })
+ ->second;
+ Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
+ } else {
+ RetVals.emplace_back(VA.getLocReg(), Arg);
+ RegsUsed.insert(VA.getLocReg());
+ }
+ }
+
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ for (auto &RetVal : RetVals) {
+ Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Windows AArch64 ABIs require that for returning structs by value we copy
@@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
- unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+ unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
if (OpFlags != AArch64II::MO_NO_FLAG)
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
@@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDLoc DL(Op);
MVT PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
@@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
- MVT::i64, DL, Chain, DescAddr,
+ PtrMemVT, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ 8,
- MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable);
+ /* Alignment = */ PtrMemVT.getSizeInBits() / 8,
+ MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
+ // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
+ FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
@@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
@@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
+ FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
@@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
// pointer.
SDLoc DL(Op);
- unsigned VaListSize =
- Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
+ unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
+ unsigned VaListSize = (Subtarget->isTargetDarwin() ||
+ Subtarget->isTargetWindows()) ? PtrSize : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
- Op.getOperand(2),
- DAG.getConstant(VaListSize, DL, MVT::i32),
- 8, false, false, false, MachinePointerInfo(DestSV),
+ return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize,
+ false, false, false, MachinePointerInfo(DestSV),
MachinePointerInfo(SrcSV));
}
@@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
+ unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
-
- SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
+ auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
+ SDValue VAList =
+ DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
+ VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
- if (Align > 8) {
+ if (Align > MinSlotSize) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align - 1, DL, PtrVT));
@@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
- uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+ unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
- ArgSize = 8;
+ ArgSize = std::max(ArgSize, MinSlotSize);
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
@@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
+ VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
+
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
@@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
- DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
+
+ if (Subtarget->isTargetILP32())
+ FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
+ DAG.getValueType(VT));
+
return FrameAddr;
}
@@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = MatchRegisterName(RegName);
+Register AArch64TargetLowering::
+getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const {
+ Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
@@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
return "r";
}
+enum PredicateConstraint {
+ Upl,
+ Upa,
+ Invalid
+};
+
+static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
+ PredicateConstraint P = PredicateConstraint::Invalid;
+ if (Constraint == "Upa")
+ P = PredicateConstraint::Upa;
+ if (Constraint == "Upl")
+ P = PredicateConstraint::Upl;
+ return P;
+}
+
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
@@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
switch (Constraint[0]) {
default:
break;
- case 'z':
- return C_Other;
case 'x':
case 'w':
+ case 'y':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'Y':
+ case 'Z':
+ return C_Immediate;
+ case 'z':
case 'S': // A symbolic address
return C_Other;
}
- }
+ } else if (parsePredicateConstraint(Constraint) !=
+ PredicateConstraint::Invalid)
+ return C_RegisterClass;
return TargetLowering::getConstraintType(Constraint);
}
@@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
break;
case 'x':
case 'w':
+ case 'y':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
break;
case 'z':
weight = CW_Constant;
break;
+ case 'U':
+ if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
+ weight = CW_Register;
+ break;
}
return weight;
}
@@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
case 'w':
if (!Subtarget->hasFPARMv8())
break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPRRegClass);
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
@@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
case 'x':
if (!Subtarget->hasFPARMv8())
break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
+ case 'y':
+ if (!Subtarget->hasFPARMv8())
+ break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
+ break;
+ }
+ } else {
+ PredicateConstraint PC = parsePredicateConstraint(Constraint);
+ if (PC != PredicateConstraint::Invalid) {
+ assert(VT.isScalableVector());
+ bool restricted = (PC == PredicateConstraint::Upl);
+ return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
+ : std::make_pair(0U, &AArch64::PPRRegClass);
}
}
if (StringRef("{cc}").equals_lower(Constraint))
@@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
@@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
- EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
+ EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
if (SplitV0) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
@@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return GenerateTBL(Op, ShuffleMask, DAG);
}
+SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT ElemVT = VT.getScalarType();
+
+ SDValue SplatVal = Op.getOperand(0);
+
+ // Extend input splat value where needed to fit into a GPR (32b or 64b only)
+ // FPRs don't have this restriction.
+ switch (ElemVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ case MVT::i16:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+ break;
+ case MVT::i64:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ break;
+ case MVT::i32:
+ // Fine as is
+ break;
+ // TODO: we can support splats of i1s and float types, but haven't added
+ // patterns yet.
+ case MVT::i1:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ default:
+ llvm_unreachable("Unsupported SPLAT_VECTOR input operand type");
+ break;
+ }
+
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+}
+
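// Sketch of the operand widening done by LowerSPLAT_VECTOR above: DUP takes
// its scalar from a 32- or 64-bit GPR, so i8/i16 splat values are widened to
// 32 bits first while i32 and i64 pass through. The function maps an element
// bit width to the GPR width that would feed the DUP and rejects everything
// else, matching the cases the lowering currently claims to support.
#include <cassert>
#include <stdexcept>

unsigned splatSourceGPRBits(unsigned ElemBits) {
  switch (ElemBits) {
  case 8:
  case 16:
  case 32:
    return 32;  // any-extend narrow elements into a W register
  case 64:
    return 64;  // X register
  default:
    throw std::invalid_argument("unsupported SPLAT_VECTOR element width");
  }
}

int main() {
  assert(splatSourceGPRBits(8) == 32);
  assert(splatSourceGPRBits(64) == 64);
  return 0;
}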
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 16;
+ Info.align = Align(16);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxp:
@@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = 16;
+ Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
default:
@@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
- countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
+ countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
@@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
return MVT::Other;
}
+LLT AArch64TargetLowering::getOptimalMemOpLLT(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ bool CanImplicitFloat =
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
+ bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
+ // Only use AdvSIMD to implement memset of 32-byte and above. It would have
+ // taken one instruction to materialize the v2i64 zero and one store (with
+ // restrictive addressing mode). Just do i64 stores.
+ bool IsSmallMemset = IsMemset && Size < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
+ if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ return true;
+ bool Fast;
+ return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
+ &Fast) &&
+ Fast;
+ };
+
+ if (CanUseNEON && IsMemset && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, 16))
+ return LLT::vector(2, 64);
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ return LLT::scalar(128);
+ if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ return LLT::scalar(64);
+ if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ return LLT::scalar(32);
+ return LLT();
+}
+
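// Sketch of the type-selection cascade in getOptimalMemOpLLT above: prefer the
// widest store the alignment (or a fast unaligned access) allows, falling back
// from a 16-byte vector/FP store to i64, then i32, then no preference. NEON
// and FP availability are collapsed into a single flag and the per-type
// alignment check into booleans; this illustrates the ordering only.
#include <iostream>
#include <string>

std::string pickMemOpWidth(unsigned long long Size, bool IsMemset,
                           bool CanUse16ByteStores, bool Aligned16,
                           bool Aligned8, bool Aligned4) {
  bool SmallMemset = IsMemset && Size < 32;  // not worth materialising a vector
  if (CanUse16ByteStores && IsMemset && !SmallMemset && Aligned16)
    return "v2i64";
  if (CanUse16ByteStores && !SmallMemset && Aligned16)
    return "i128 (q-register store)";
  if (Size >= 8 && Aligned8)
    return "i64";
  if (Size >= 4 && Aligned4)
    return "i32";
  return "no preference";
}

int main() {
  std::cout << pickMemOpWidth(64, /*memset*/ true, true, true, true, true) << "\n";
  std::cout << pickMemOpWidth(12, /*memset*/ false, true, false, true, true) << "\n";
  return 0;
}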
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
@@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
Opcode = AArch64ISD::SQSHLU_I;
IsRightShift = false;
break;
+ case Intrinsic::aarch64_neon_sshl:
+ case Intrinsic::aarch64_neon_ushl:
+ // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
+ // left shift for positive shift amounts. Below, we only replace the current
+ // node with VSHL, if this condition is met.
+ Opcode = AArch64ISD::VSHL;
+ IsRightShift = false;
+ break;
}
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
@@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
+ case Intrinsic::aarch64_neon_sshl:
+ case Intrinsic::aarch64_neon_ushl:
return tryCombineShiftImm(IID, N, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
@@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return ReplacedSplat;
SDLoc DL(S);
- unsigned NumElts = VT.getVectorNumElements() / 2;
+
// Split VT into two.
- EVT HalfVT =
- EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NumElts = HalfVT.getVectorNumElements();
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(0, DL, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
@@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N,
// are predecessors to each other or the Vector.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
- Visited.insert(N);
+ Visited.insert(Addr.getNode());
Worklist.push_back(User);
Worklist.push_back(LD);
Worklist.push_back(Vector.getNode());
@@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
return Mask->getValue().isPowerOf2();
}
+bool AArch64TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does baseline recommend not to perform the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // Else, if this is a vector shift, prefer 'shl'.
+ return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
+}
+
+bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget->isTargetWindows())
+ return false;
+ return true;
+}
+
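A standalone model of the two hook decisions added above: wide shifts are kept compact under minsize except when targeting Windows, and the hoist-constant-from-shift fold is only taken for vectors when the resulting shift is 'shl'. The function names here are invented for the example and simplify the real hooks.

#include <cassert>

static bool shouldExpandShiftModel(bool HasMinSize, bool IsTargetWindows) {
  if (HasMinSize && !IsTargetWindows)
    return false;  // keep the compact form when optimizing for size
  return true;     // otherwise expand the shift inline
}

static bool preferShlForHoistedConst(bool BaselineSaysFold, bool IsScalar,
                                     bool NewShiftIsShl) {
  if (!BaselineSaysFold)
    return false;
  return IsScalar || NewShiftIsShl;  // vector shifts: only fold towards 'shl'
}

int main() {
  assert(!shouldExpandShiftModel(/*HasMinSize=*/true, /*IsTargetWindows=*/false));
  assert(shouldExpandShiftModel(/*HasMinSize=*/true, /*IsTargetWindows=*/true));
  assert(preferShlForHoistedConst(true, /*IsScalar=*/false, /*NewShiftIsShl=*/true));
  return 0;
}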
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
@@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 4421c31f65c9..00fa96bc4e6d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -191,6 +191,11 @@ enum NodeType : unsigned {
FRECPE, FRECPS,
FRSQRTE, FRSQRTS,
+ SUNPKHI,
+ SUNPKLO,
+ UUNPKHI,
+ UUNPKLO,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -261,6 +266,14 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override {
+ // Returning i64 unconditionally here (i.e. even for ILP32) means that the
+ // *DAG* representation of pointers will always be 64-bits. They will be
+ // truncated and extended when transferred to memory, but the 64-bit DAG
+ // allows us to use AArch64's addressing modes much more easily.
+ return MVT::getIntegerVT(64);
+ }
+
bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const override;
@@ -272,6 +285,10 @@ public:
EVT VT, unsigned AddrSpace = 0, unsigned Align = 1,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
+ /// LLT variant.
+ bool allowsMisalignedMemoryAccesses(
+ LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -358,6 +375,10 @@ public:
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
+ LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const override;
+
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -480,11 +501,12 @@ public:
return VT.getSizeInBits() >= 64; // vector 'bic'
}
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
@@ -655,6 +677,7 @@ private:
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
@@ -690,8 +713,8 @@ private:
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index e22cb44d81ae..459b53923625 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -204,19 +204,27 @@ def : Pat<(relaxed_store<atomic_store_64>
def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(ldxr_1 GPR64sp:$addr),
(SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
@@ -237,19 +245,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(ldaxr_1 GPR64sp:$addr),
(SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
@@ -271,22 +287,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
@@ -317,22 +341,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
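The GISelPredicateCode hooks added to these PatFrags check the width of the instruction's single memory operand. A hedged standalone sketch of what such a byte-width check amounts to, using a plain struct rather than MachineInstr; the *Model names are invented, while the real isLoadStoreOfNumBytes helper inspects MachineInstr::memoperands() in AArch64InstructionSelector.cpp.

#include <cassert>
#include <cstdint>
#include <vector>

struct MemOperandModel { uint64_t SizeInBytes; };
struct InstrModel { std::vector<MemOperandModel> MemOperands; };

// Match only when there is exactly one memory operand of the expected size.
static bool isLoadStoreOfNumBytesModel(const InstrModel &MI, uint64_t NumBytes) {
  if (MI.MemOperands.size() != 1)
    return false;
  return MI.MemOperands.front().SizeInBytes == NumBytes;
}

int main() {
  InstrModel Ldxrb;
  Ldxrb.MemOperands.push_back({1});  // an ldxrb-like access: one 1-byte operand
  assert(isLoadStoreOfNumBytesModel(Ldxrb, 1));
  assert(!isLoadStoreOfNumBytesModel(Ldxrb, 4));
  return 0;
}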
def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
@@ -422,4 +454,3 @@ let Predicates = [HasLSE] in {
defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
}
-
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index d619137b55c5..f555e4123307 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -480,76 +480,40 @@ def BranchTarget14Operand : BranchTarget<14>;
def BranchTarget26Operand : BranchTarget<26>;
def PCRelLabel19Operand : PCRelLabel<19>;
-def MovZSymbolG3AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG3";
+def MovWSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG3";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g3 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG3AsmOperand;
+def movw_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG3AsmOperand;
}
-def MovZSymbolG2AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG2";
+def MovWSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG2";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g2 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG2AsmOperand;
+def movw_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG2AsmOperand;
}
-def MovZSymbolG1AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG1";
+def MovWSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG1";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g1 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG1AsmOperand;
+def movw_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG1AsmOperand;
}
-def MovZSymbolG0AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG0";
+def MovWSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG0";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g0 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG0AsmOperand;
-}
-
-def MovKSymbolG3AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG3";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g3 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG3AsmOperand;
-}
-
-def MovKSymbolG2AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG2";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g2 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG2AsmOperand;
-}
-
-def MovKSymbolG1AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG1";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g1 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG1AsmOperand;
-}
-
-def MovKSymbolG0AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG0";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g0 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG0AsmOperand;
+def movw_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG0AsmOperand;
}
class fixedpoint_i32<ValueType FloatVT>
@@ -673,6 +637,11 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>;
+def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">,
+ GISDNodeXFormEquiv<logical_imm32_XFORM>;
+def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">,
+ GISDNodeXFormEquiv<logical_imm64_XFORM>;
+
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
@@ -714,12 +683,15 @@ def logical_imm64_not : Operand<i64> {
let ParserMatchClass = LogicalImm64NotOperand;
}
-// imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
+let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
+def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
-}]> {
- let ParserMatchClass = AsmImmRange<0, 65535>;
- let PrintMethod = "printImmHex";
+}]>;
+
+def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 65536;
+}]>;
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
@@ -815,6 +787,14 @@ class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+def gi_arith_shifted_reg32 :
+ GIComplexOperandMatcher<s32, "selectArithShiftedRegister">,
+ GIComplexPatternEquiv<arith_shifted_reg32>;
+
+def gi_arith_shifted_reg64 :
+ GIComplexOperandMatcher<s64, "selectArithShiftedRegister">,
+ GIComplexPatternEquiv<arith_shifted_reg64>;
+
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
// {5-0} - imm6
@@ -837,6 +817,14 @@ class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+def gi_logical_shifted_reg32 :
+ GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">,
+ GIComplexPatternEquiv<logical_shifted_reg32>;
+
+def gi_logical_shifted_reg64 :
+ GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">,
+ GIComplexPatternEquiv<logical_shifted_reg64>;
+
// A logical vector shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0, #8, #16, or #24
@@ -918,6 +906,14 @@ class neg_addsub_shifted_imm<ValueType Ty>
def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+def gi_neg_addsub_shifted_imm32 :
+ GIComplexOperandMatcher<s32, "selectNegArithImmed">,
+ GIComplexPatternEquiv<neg_addsub_shifted_imm32>;
+
+def gi_neg_addsub_shifted_imm64 :
+ GIComplexOperandMatcher<s64, "selectNegArithImmed">,
+ GIComplexPatternEquiv<neg_addsub_shifted_imm64>;
+
// An extend operand:
// {5-3} - extend type
// {2-0} - imm3
@@ -948,6 +944,21 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
let MIOperandInfo = (ops GPR32, arith_extend64);
}
+def arith_extended_reg32_i32 : arith_extended_reg32<i32>;
+def gi_arith_extended_reg32_i32 :
+ GIComplexOperandMatcher<s32, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32_i32>;
+
+def arith_extended_reg32_i64 : arith_extended_reg32<i64>;
+def gi_arith_extended_reg32_i64 :
+ GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32_i64>;
+
+def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>;
+def gi_arith_extended_reg32to64_i64 :
+ GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
+
// Floating-point immediate.
def fpimm16 : Operand<f16>,
FPImmLeaf<f16, [{
@@ -1000,8 +1011,8 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let RenderMethod = "addVectorIndexOperands";
}
-class AsmVectorIndexOpnd<AsmOperandClass mc, code pred>
- : Operand<i64>, ImmLeaf<i64, pred> {
+class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred>
+ : Operand<ty>, ImmLeaf<ty, pred> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
}
@@ -1012,11 +1023,17 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
-def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+
+def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
@@ -1025,15 +1042,15 @@ def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
def sve_elm_idx_extdup_b
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
def sve_elm_idx_extdup_h
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
def sve_elm_idx_extdup_s
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
def sve_elm_idx_extdup_d
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
def sve_elm_idx_extdup_q
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
@@ -1082,6 +1099,45 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
let Inst{4-0} = Rt;
}
+// System instructions for transactional memory extension
+class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops,
+ string asm, string operands, list<dag> pattern>
+ : BaseSystemI<L, oops, iops, asm, operands, pattern>,
+ Sched<[WriteSys]> {
+ let Inst{20-12} = 0b000110011;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = op2;
+ let DecoderMethod = "";
+
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+// System instructions for transactional memory - single input operand
+class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
+ : TMBaseSystemI<0b1, CRm, 0b011,
+ (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> {
+ bits<5> Rt;
+ let Inst{4-0} = Rt;
+}
+
+// System instructions for transactional memory - no operand
+class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
+ : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
+ let Inst{4-0} = 0b11111;
+}
+
+// System instructions for exit from transactions
+class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
+ : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
+ Sched<[WriteSys]> {
+ bits<16> imm;
+ let Inst{31-24} = 0b11010100;
+ let Inst{23-21} = op1;
+ let Inst{20-5} = imm;
+ let Inst{4-0} = 0b00000;
+}
+
// Hint instructions that take both a CRm and a 3-bit immediate.
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity
@@ -2180,11 +2236,11 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
// Add/Subtract extended register
let AddedComplexity = 1, hasSideEffects = 0 in {
def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
- arith_extended_reg32<i32>, mnemonic, OpNode> {
+ arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
- arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+ arith_extended_reg32to64_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
@@ -2254,11 +2310,11 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
// Add/Subtract extended register
let AddedComplexity = 1 in {
def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
- arith_extended_reg32<i32>, mnemonic, OpNode> {
+ arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
- arith_extended_reg32<i64>, mnemonic, OpNode> {
+ arith_extended_reg32_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
@@ -2969,6 +3025,22 @@ def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+def gi_ro_Xindexed8 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">,
+ GIComplexPatternEquiv<ro_Xindexed8>;
+def gi_ro_Xindexed16 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">,
+ GIComplexPatternEquiv<ro_Xindexed16>;
+def gi_ro_Xindexed32 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">,
+ GIComplexPatternEquiv<ro_Xindexed32>;
+def gi_ro_Xindexed64 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">,
+ GIComplexPatternEquiv<ro_Xindexed64>;
+def gi_ro_Xindexed128 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">,
+ GIComplexPatternEquiv<ro_Xindexed128>;
+
def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
@@ -4086,7 +4158,7 @@ multiclass MemTagStore<bits<2> opc1, string insn> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
- : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+ : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 215e96a82d0e..5c35e5bcdd30 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
@@ -82,6 +83,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
}
+ // Meta-instructions emit no code.
+ if (MI.isMetaInstruction())
+ return 0;
+
// FIXME: We currently only handle pseudoinstructions that don't get expanded
// before the assembly printer.
unsigned NumBytes = 0;
@@ -91,12 +96,6 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// Anything not explicitly designated otherwise is a normal 4-byte insn.
NumBytes = 4;
break;
- case TargetOpcode::DBG_VALUE:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- NumBytes = 0;
- break;
case TargetOpcode::STACKMAP:
// The upper bound for a stackmap intrinsic is the full length of its shadow
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -416,7 +415,7 @@ unsigned AArch64InstrInfo::insertBranch(
// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
- while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+ while (Register::isVirtualRegister(VReg)) {
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (!DefMI->isFullCopy())
return VReg;
@@ -431,7 +430,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned *NewVReg = nullptr) {
VReg = removeCopies(MRI, VReg);
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ if (!Register::isVirtualRegister(VReg))
return 0;
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
@@ -574,7 +573,7 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC = AArch64CC::NE;
break;
}
- unsigned SrcReg = Cond[2].getReg();
+ Register SrcReg = Cond[2].getReg();
if (Is64Bit) {
// cmp reg, #0 is actually subs xzr, reg, #0.
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
@@ -930,7 +929,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
- const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
+ const MachineInstr &MIa, const MachineInstr &MIb) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
@@ -1071,8 +1070,8 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
assert(MO.isReg() &&
"Operand has register constraints without being a register!");
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
if (!OpRegCstraints->contains(Reg))
return false;
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
@@ -1472,6 +1471,8 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
MachineBasicBlock &MBB = *MI.getParent();
+ auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
+ auto TRI = Subtarget.getRegisterInfo();
DebugLoc DL = MI.getDebugLoc();
if (MI.getOpcode() == AArch64::CATCHRET) {
@@ -1497,21 +1498,32 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
- unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+ unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
if ((OpFlags & AArch64II::MO_GOT) != 0) {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, OpFlags);
- BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(0)
- .addMemOperand(*MI.memoperands_begin());
+ if (Subtarget.isTargetILP32()) {
+ unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+ .addDef(Reg32, RegState::Dead)
+ .addUse(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin())
+ .addDef(Reg, RegState::Implicit);
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
+ }
} else if (TM.getCodeModel() == CodeModel::Large) {
+ assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
.addImm(0);
@@ -1538,10 +1550,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
- BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, LoFlags)
- .addMemOperand(*MI.memoperands_begin());
+ if (Subtarget.isTargetILP32()) {
+ unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+ .addDef(Reg32, RegState::Dead)
+ .addUse(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI.memoperands_begin())
+ .addDef(Reg, RegState::Implicit);
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI.memoperands_begin());
+ }
}
MBB.erase(MI);
@@ -1581,7 +1603,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
break;
case TargetOpcode::COPY: {
// GPR32 copies will by lowered to ORRXrs
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
@@ -1611,7 +1633,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
break;
case TargetOpcode::COPY: {
// FPR64 copies will by lowered to ORR.16b
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
@@ -1917,7 +1939,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
if (MI.getOperand(1).isReg()) {
- unsigned BaseReg = MI.getOperand(1).getReg();
+ Register BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
return false;
@@ -1928,6 +1950,17 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
if (isLdStPairSuppressed(MI))
return false;
+ // Do not pair any callee-save store/reload instructions in the
+ // prologue/epilogue if the CFI information encoded the operations as separate
+ // instructions, as that will cause the size of the actual prologue to mismatch
+ // with the prologue size recorded in the Windows CFI.
+ const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
+ bool NeedsWinCFI = MAI->usesWindowsCFI() &&
+ MI.getMF()->getFunction().needsUnwindTableEntry();
+ if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy)))
+ return false;
+
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.isPaired128Slow()) {
switch (MI.getOpcode()) {
@@ -2165,6 +2198,18 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
+ Scale = Width = 2;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
+ Scale = Width = 16;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
Scale = 16;
@@ -2350,7 +2395,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
if (!SubIdx)
return MIB.addReg(Reg, State);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
@@ -2474,6 +2519,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ // Copy a Predicate register by ORRing with itself.
+ if (AArch64::PPRRegClass.contains(DestReg) &&
+ AArch64::PPRRegClass.contains(SrcReg)) {
+ assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
+ .addReg(SrcReg) // Pg
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // Copy a Z register by ORRing with itself.
+ if (AArch64::ZPRRegClass.contains(DestReg) &&
+ AArch64::ZPRRegClass.contains(SrcReg)) {
+ assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
if (AArch64::GPR64spRegClass.contains(DestReg) &&
(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
@@ -2722,7 +2788,7 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineMemOperand *MMO) {
unsigned SrcReg0 = SrcReg;
unsigned SrcReg1 = SrcReg;
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
@@ -2761,7 +2827,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRWui;
- if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
else
assert(SrcReg != AArch64::WSP);
@@ -2771,7 +2837,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRXui;
- if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
else
assert(SrcReg != AArch64::SP);
@@ -2852,7 +2918,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
unsigned DestReg0 = DestReg;
unsigned DestReg1 = DestReg;
bool IsUndef = true;
- if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+ if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
SubIdx0 = 0;
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
@@ -2892,7 +2958,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRWui;
- if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
else
assert(DestReg != AArch64::WSP);
@@ -2902,7 +2968,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRXui;
- if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
else
assert(DestReg != AArch64::SP);
@@ -2972,21 +3038,39 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MI.addMemOperand(MMO);
}
-void llvm::emitFrameOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
- unsigned DestReg, unsigned SrcReg, int Offset,
- const TargetInstrInfo *TII,
- MachineInstr::MIFlag Flag, bool SetNZCV,
- bool NeedsWinCFI, bool *HasWinCFI) {
- if (DestReg == SrcReg && Offset == 0)
- return;
-
- assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
- "SP increment/decrement not 16-byte aligned");
-
- bool isSub = Offset < 0;
- if (isSub)
- Offset = -Offset;
+// Helper function to emit a frame offset adjustment from a given
+// pointer (SrcReg), stored into DestReg. This function is explicit
+// in that it requires the opcode.
+static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, int64_t Offset, unsigned Opc,
+ const TargetInstrInfo *TII,
+ MachineInstr::MIFlag Flag, bool NeedsWinCFI,
+ bool *HasWinCFI) {
+ int Sign = 1;
+ unsigned MaxEncoding, ShiftSize;
+ switch (Opc) {
+ case AArch64::ADDXri:
+ case AArch64::ADDSXri:
+ case AArch64::SUBXri:
+ case AArch64::SUBSXri:
+ MaxEncoding = 0xfff;
+ ShiftSize = 12;
+ break;
+ case AArch64::ADDVL_XXI:
+ case AArch64::ADDPL_XXI:
+ MaxEncoding = 31;
+ ShiftSize = 0;
+ if (Offset < 0) {
+ MaxEncoding = 32;
+ Sign = -1;
+ Offset = -Offset;
+ }
+ break;
+ default:
+ llvm_unreachable("Unsupported opcode");
+ }
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
@@ -2999,65 +3083,94 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
// of code.
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
- unsigned Opc;
- if (SetNZCV)
- Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
- else
- Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
- const unsigned MaxEncoding = 0xfff;
- const unsigned ShiftSize = 12;
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
- while (((unsigned)Offset) >= (1 << ShiftSize)) {
- unsigned ThisVal;
- if (((unsigned)Offset) > MaxEncodableValue) {
- ThisVal = MaxEncodableValue;
- } else {
- ThisVal = Offset & MaxEncodableValue;
+ do {
+ unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue);
+ unsigned LocalShiftSize = 0;
+ if (ThisVal > MaxEncoding) {
+ ThisVal = ThisVal >> ShiftSize;
+ LocalShiftSize = ShiftSize;
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
- BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
- .addReg(SrcReg)
- .addImm(ThisVal >> ShiftSize)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
- .setMIFlag(Flag);
-
- if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
+ auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(Sign * (int)ThisVal);
+ if (ShiftSize)
+ MBI = MBI.addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
+ MBI = MBI.setMIFlag(Flag);
+
+ if (NeedsWinCFI) {
+ assert(Sign == 1 && "SEH directives should always have a positive sign");
+ int Imm = (int)(ThisVal << LocalShiftSize);
+ if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
+ (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
+ if (Imm == 0)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
+ else
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
+ .addImm(Imm)
+ .setMIFlag(Flag);
+ assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
+ "emit a single SEH directive");
+ } else if (DestReg == AArch64::SP) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
+ assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+ .addImm(Imm)
+ .setMIFlag(Flag);
+ }
if (HasWinCFI)
*HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
- .addImm(ThisVal)
- .setMIFlag(Flag);
}
SrcReg = DestReg;
- Offset -= ThisVal;
- if (Offset == 0)
- return;
- }
- BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
- .addReg(SrcReg)
- .addImm(Offset)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
- .setMIFlag(Flag);
+ Offset -= ThisVal << LocalShiftSize;
+ } while (Offset);
+}
- if (NeedsWinCFI) {
- if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
- (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
- if (HasWinCFI)
- *HasWinCFI = true;
- if (Offset == 0)
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
- setMIFlag(Flag);
- else
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
- addImm(Offset).setMIFlag(Flag);
- } else if (DestReg == AArch64::SP) {
- if (HasWinCFI)
- *HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
- addImm(Offset).setMIFlag(Flag);
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned DestReg, unsigned SrcReg,
+ StackOffset Offset, const TargetInstrInfo *TII,
+ MachineInstr::MIFlag Flag, bool SetNZCV,
+ bool NeedsWinCFI, bool *HasWinCFI) {
+ int64_t Bytes, NumPredicateVectors, NumDataVectors;
+ Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
+
+ // First emit non-scalable frame offsets, or a simple 'mov'.
+ if (Bytes || (!Offset && SrcReg != DestReg)) {
+ assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
+ "SP increment/decrement not 16-byte aligned");
+ unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
+ if (Bytes < 0) {
+ Bytes = -Bytes;
+ Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
}
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
+ NeedsWinCFI, HasWinCFI);
+ SrcReg = DestReg;
+ }
+
+ assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
+ "SetNZCV not supported with SVE vectors");
+ assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
+ "WinCFI not supported with SVE vectors");
+
+ if (NumDataVectors) {
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
+ AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+ SrcReg = DestReg;
+ }
+
+ if (NumPredicateVectors) {
+ assert(DestReg != AArch64::SP && "Unaligned access to SP");
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
+ AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
}
}
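A standalone sketch of the offset-chunking loop in emitFrameOffsetAdj above: a 12-bit immediate, optionally shifted left by 12, is peeled off the offset until nothing remains. Plain integers only; splitFrameOffset and the list of (imm, shift) pairs are invented for the example and no MachineInstrs are built.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static std::vector<std::pair<unsigned, unsigned>> splitFrameOffset(uint64_t Offset) {
  const unsigned MaxEncoding = 0xfff, ShiftSize = 12;
  const uint64_t MaxEncodableValue = uint64_t(MaxEncoding) << ShiftSize;
  std::vector<std::pair<unsigned, unsigned>> Chunks; // (imm12, lsl amount)
  do {
    uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
    unsigned LocalShift = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal >>= ShiftSize;   // encode the high part with 'lsl #12'
      LocalShift = ShiftSize;
    }
    Chunks.push_back({unsigned(ThisVal), LocalShift});
    Offset -= ThisVal << LocalShift;
  } while (Offset);
  return Chunks;
}

int main() {
  // 0x101008 = (0x101 << 12) + 0x8: one shifted chunk plus one plain chunk.
  auto Chunks = splitFrameOffset(0x101008);
  assert(Chunks.size() == 2);
  assert(Chunks[0] == std::make_pair(0x101u, 12u));
  assert(Chunks[1] == std::make_pair(0x8u, 0u));
  return 0;
}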
@@ -3079,15 +3192,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
// <rdar://problem/11522048>
//
if (MI.isFullCopy()) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (SrcReg == AArch64::SP &&
- TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
return nullptr;
}
- if (DstReg == AArch64::SP &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
}
@@ -3127,14 +3238,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineBasicBlock &MBB = *MI.getParent();
const MachineOperand &DstMO = MI.getOperand(0);
const MachineOperand &SrcMO = MI.getOperand(1);
- unsigned DstReg = DstMO.getReg();
- unsigned SrcReg = SrcMO.getReg();
+ Register DstReg = DstMO.getReg();
+ Register SrcReg = SrcMO.getReg();
// This is slightly expensive to compute for physical regs since
// getMinimalPhysRegClass is slow.
auto getRegClass = [&](unsigned Reg) {
- return TargetRegisterInfo::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : TRI.getMinimalPhysRegClass(Reg);
+ return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
+ : TRI.getMinimalPhysRegClass(Reg);
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
@@ -3159,8 +3269,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
//
// STRXui %xzr, %stack.0
//
- if (IsSpill && DstMO.isUndef() &&
- TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
const TargetRegisterClass *SpillRC;
@@ -3243,10 +3352,23 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
return nullptr;
}
-int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+static bool isSVEScaledImmInstruction(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
+ StackOffset &SOffset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
- int *EmittableOffset) {
+ int64_t *EmittableOffset) {
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
@@ -3285,6 +3407,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
+ bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
+ int64_t Offset =
+ IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
+
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
Offset += ImmOpnd.getImm() * Scale;
@@ -3304,7 +3430,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
"Cannot have remainder when using unscaled op");
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
- int NewOffset = Offset / Scale;
+ int64_t NewOffset = Offset / Scale;
if (MinOff <= NewOffset && NewOffset <= MaxOff)
Offset = Remainder;
else {
@@ -3319,27 +3445,33 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
if (OutUnscaledOp && UnscaledOp)
*OutUnscaledOp = *UnscaledOp;
+ if (IsMulVL)
+ SOffset = StackOffset(Offset, MVT::nxv1i8) +
+ StackOffset(SOffset.getBytes(), MVT::i8);
+ else
+ SOffset = StackOffset(Offset, MVT::i8) +
+ StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
return AArch64FrameOffsetCanUpdate |
- (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+ (SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
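A simplified standalone model of the StackOffset split used above: fixed bytes and scalable "bytes times vector length" are tracked separately, and an SVE-scaled instruction only folds the scalable part while the fixed part stays pending (and vice versa). StackOffsetModel and takeFoldableOffset are invented names; the real class lives in AArch64StackOffset.h.

#include <cassert>
#include <cstdint>

struct StackOffsetModel {
  int64_t Bytes;          // ordinary byte offset
  int64_t ScalableBytes;  // multiplied by the runtime vector length
  explicit operator bool() const { return Bytes || ScalableBytes; }
};

// Take the component this instruction can encode, leaving the rest behind.
static int64_t takeFoldableOffset(StackOffsetModel &SO, bool IsMulVL) {
  int64_t Taken = IsMulVL ? SO.ScalableBytes : SO.Bytes;
  (IsMulVL ? SO.ScalableBytes : SO.Bytes) = 0;
  return Taken;
}

int main() {
  StackOffsetModel SO{16, 32};                  // 16 bytes + 32 scalable bytes
  int64_t Folded = takeFoldableOffset(SO, /*IsMulVL=*/true);
  assert(Folded == 32 && SO.Bytes == 16 && bool(SO)); // fixed part still pending
  return 0;
}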
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII) {
unsigned Opcode = MI.getOpcode();
unsigned ImmIdx = FrameRegIdx + 1;
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
- Offset += MI.getOperand(ImmIdx).getImm();
+ Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
MI.eraseFromParent();
- Offset = 0;
+ Offset = StackOffset();
return true;
}
- int NewOffset;
+ int64_t NewOffset;
unsigned UnscaledOp;
bool UseUnscaledOp;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
@@ -3352,7 +3484,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
MI.setDesc(TII->get(UnscaledOp));
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
- return Offset == 0;
+ return !Offset;
}
return false;
@@ -3428,13 +3560,19 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
default:
break;
+ case AArch64::FADDHrr:
case AArch64::FADDSrr:
case AArch64::FADDDrr:
+ case AArch64::FADDv4f16:
+ case AArch64::FADDv8f16:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
+ case AArch64::FSUBHrr:
case AArch64::FSUBSrr:
case AArch64::FSUBDrr:
+ case AArch64::FSUBv4f16:
+ case AArch64::FSUBv8f16:
case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
@@ -3459,7 +3597,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
@@ -3544,86 +3682,48 @@ static bool getMaddPatterns(MachineInstr &Root,
Opc = NewOpc;
}
+ auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
+ MachineCombinerPattern Pattern) {
+ if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
+ Patterns.push_back(Pattern);
+ Found = true;
+ }
+ };
+
+ typedef MachineCombinerPattern MCP;
+
switch (Opc) {
default:
break;
case AArch64::ADDWrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"ADDWrr does not have register operands");
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
+ setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
break;
case AArch64::ADDXrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
+ setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
break;
case AArch64::SUBWrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
+ setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
break;
case AArch64::SUBXrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
+ setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
break;
case AArch64::ADDWri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
break;
case AArch64::ADDXri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
break;
case AArch64::SUBWri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
break;
case AArch64::SUBXri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
break;
}
return Found;
@@ -3640,204 +3740,135 @@ static bool getFMAPatterns(MachineInstr &Root,
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
+ auto Match = [&](int Opcode, int Operand,
+ MachineCombinerPattern Pattern) -> bool {
+ if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
+ Patterns.push_back(Pattern);
+ return true;
+ }
+ return false;
+ };
+
+ typedef MachineCombinerPattern MCP;
+
switch (Root.getOpcode()) {
default:
assert(false && "Unsupported FP instruction in combiner\n");
break;
+ case AArch64::FADDHrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "FADDHrr does not have register operands");
+
+ Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
+ Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
+ break;
case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
- "FADDWrr does not have register operands");
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
- Found = true;
- }
+ "FADDSrr does not have register operands");
+
+ Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
+ Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
+
+ Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
+ Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
break;
case AArch64::FADDDrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
+ Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
+
+ Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
+ Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
+ break;
+ case AArch64::FADDv4f16:
+ Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
+ Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
+
+ Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
+ Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
+ break;
+ case AArch64::FADDv8f16:
+ Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
+ Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
+
+ Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
+ Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
break;
case AArch64::FADDv2f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
+ Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
+
+ Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
+ Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
break;
case AArch64::FADDv2f64:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
+ Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
+
+ Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
+ Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
break;
case AArch64::FADDv4f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
- Found = true;
- }
- break;
+ Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
+ Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
+ Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
+ Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
+ break;
+ case AArch64::FSUBHrr:
+ Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
+ Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
+ Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
+ break;
case AArch64::FSUBSrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
- Found = true;
- }
+ Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
+
+ Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
+ Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
+
+ Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
break;
case AArch64::FSUBDrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
- Found = true;
- }
+ Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
+
+ Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
+ Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
+
+ Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
+ break;
+ case AArch64::FSUBv4f16:
+ Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
+ Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
+
+ Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
+ Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
+ break;
+ case AArch64::FSUBv8f16:
+ Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
+ Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
+
+ Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
+ Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
break;
case AArch64::FSUBv2f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
+ Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
+
+ Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
+ Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
break;
case AArch64::FSUBv2f64:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
+ Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
+
+ Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
+ Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
break;
case AArch64::FSUBv4f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
+ Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
+
+ Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
+ Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
break;
}
return Found;
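
The hunk above folds each repeated canCombineWithFMUL / Patterns.push_back block into a local Match lambda; the short-circuiting '||' keeps the old per-operand priority of trying the plain FMUL opcode before the indexed form. A minimal standalone sketch of the same idiom, using placeholder opcodes and pattern names rather than the real AArch64 ones:

    #include <functional>
    #include <vector>

    enum class Pattern { MulAdd_Op1, MulAddIndexed_Op1 };

    // Returns true if any pattern was recorded; CanCombine stands in for
    // canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode).
    bool findPatterns(const std::function<bool(int, int)> &CanCombine,
                      std::vector<Pattern> &Patterns) {
      bool Found = false;
      auto Match = [&](int Opcode, int Operand, Pattern P) -> bool {
        if (CanCombine(Opcode, Operand)) {
          Patterns.push_back(P);
          return true;
        }
        return false;
      };
      // 100/101 are placeholder ids for a plain and an indexed multiply.
      Found |= Match(100, 1, Pattern::MulAdd_Op1) ||
               Match(101, 1, Pattern::MulAddIndexed_Op1);
      return Found;
    }
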
@@ -3851,6 +3882,10 @@ bool AArch64InstrInfo::isThroughputPattern(
switch (Pattern) {
default:
break;
+ case MachineCombinerPattern::FMULADDH_OP1:
+ case MachineCombinerPattern::FMULADDH_OP2:
+ case MachineCombinerPattern::FMULSUBH_OP1:
+ case MachineCombinerPattern::FMULSUBH_OP2:
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:
@@ -3859,12 +3894,21 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBH_OP1:
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f16_OP2:
+ case MachineCombinerPattern::FMLAv4f16_OP1:
+ case MachineCombinerPattern::FMLAv8f16_OP1:
+ case MachineCombinerPattern::FMLAv8f16_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
@@ -3877,10 +3921,18 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4f16_OP1:
+ case MachineCombinerPattern::FMLSv4f16_OP2:
+ case MachineCombinerPattern::FMLSv8f16_OP1:
+ case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
@@ -3933,15 +3985,15 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
unsigned MaddOpc, const TargetRegisterClass *RC,
FMAInstKind kind = FMAInstKind::Default,
- const unsigned *ReplacedAddend = nullptr) {
+ const Register *ReplacedAddend = nullptr) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
- unsigned ResultReg = Root.getOperand(0).getReg();
- unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ Register ResultReg = Root.getOperand(0).getReg();
+ Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
- unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
unsigned SrcReg2;
@@ -3955,13 +4007,13 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
}
- if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
+ if (Register::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
MachineInstrBuilder MIB;
@@ -4015,19 +4067,19 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
- unsigned ResultReg = Root.getOperand(0).getReg();
- unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ Register ResultReg = Root.getOperand(0).getReg();
+ Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
- unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
- if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
- if (TargetRegisterInfo::isVirtualRegister(VR))
+ if (Register::isVirtualRegister(VR))
MRI.constrainRegClass(VR, RC);
MachineInstrBuilder MIB =
@@ -4116,7 +4168,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
@@ -4158,7 +4210,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(SubRC);
+ Register NewVR = MRI.createVirtualRegister(SubRC);
// SUB NewVR, 0, C
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
@@ -4208,7 +4260,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
@@ -4228,34 +4280,35 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
break;
}
// Floating Point Support
+ case MachineCombinerPattern::FMULADDH_OP1:
+ Opc = AArch64::FMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDS_OP1:
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDD_OP1:
- // MUL I=A,B,0
- // ADD R,I,C
- // ==> MADD R,A,B,C
- // --- Create(MADD);
- if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
- Opc = AArch64::FMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
+
+ case MachineCombinerPattern::FMULADDH_OP2:
+ Opc = AArch64::FMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDS_OP2:
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDD_OP2:
- // FMUL I=A,B,0
- // FADD R,C,I
- // ==> FMADD R,A,B,C
- // --- Create(FMADD);
- if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
- Opc = AArch64::FMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
@@ -4285,6 +4338,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Indexed);
break;
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv4f16_OP1:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv4f16_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;
@@ -4312,6 +4390,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv8f16_OP1:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv8f16_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;
@@ -4367,56 +4470,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMULSUBH_OP1:
+ Opc = AArch64::FNMSUBHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULSUBS_OP1:
- case MachineCombinerPattern::FMULSUBD_OP1: {
- // FMUL I=A,B,0
- // FSUB R,I,C
- // ==> FNMSUB R,A,B,C // = -C + A*B
- // --- Create(FNMSUB);
- if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
- Opc = AArch64::FNMSUBSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FNMSUBDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULSUBD_OP1:
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- }
+ case MachineCombinerPattern::FNMULSUBH_OP1:
+ Opc = AArch64::FNMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FNMULSUBS_OP1:
- case MachineCombinerPattern::FNMULSUBD_OP1: {
- // FNMUL I=A,B,0
- // FSUB R,I,C
- // ==> FNMADD R,A,B,C // = -A*B - C
- // --- Create(FNMADD);
- if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
- Opc = AArch64::FNMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FNMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FNMULSUBD_OP1:
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- }
+ case MachineCombinerPattern::FMULSUBH_OP2:
+ Opc = AArch64::FMSUBHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULSUBS_OP2:
- case MachineCombinerPattern::FMULSUBD_OP2: {
- // FMUL I=A,B,0
- // FSUB R,C,I
- // ==> FMSUB R,A,B,C (computes C - A*B)
- // --- Create(FMSUB);
- if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
- Opc = AArch64::FMSUBSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMSUBDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULSUBD_OP2:
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
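
For reference, the per-case comments removed in the hunk above spelled out which fused form each pattern maps to. Collected here, with a plain C++ reference model of the scalar semantics exactly as those comments stated them (illustrative only, not target code):

    // FMADD  r = a*b + c     <- FMULADDx_OP1 / _OP2
    // FMSUB  r = c - a*b     <- FMULSUBx_OP2   (FSUB R,C,I with I = A*B)
    // FNMSUB r = a*b - c     <- FMULSUBx_OP1   (FSUB R,I,C with I = A*B)
    // FNMADD r = -(a*b) - c  <- FNMULSUBx_OP1  (FSUB R,I,C with I = -(A*B))
    double fmadd_ref(double a, double b, double c)  { return a * b + c; }
    double fmsub_ref(double a, double b, double c)  { return c - a * b; }
    double fnmsub_ref(double a, double b, double c) { return a * b - c; }
    double fnmadd_ref(double a, double b, double c) { return -(a * b) - c; }
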
@@ -4432,6 +4532,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Indexed);
break;
+ case MachineCombinerPattern::FMLSv4f16_OP1:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
+ RC = &AArch64::FPR64RegClass;
+ Register NewVR = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
+ .add(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator, &NewVR);
+ } else {
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed, &NewVR);
+ }
+ break;
+ }
+ case MachineCombinerPattern::FMLSv4f16_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLSv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLSv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;
@@ -4446,6 +4579,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMLSv8f16_OP1:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
+ RC = &AArch64::FPR128RegClass;
+ Register NewVR = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
+ .add(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator, &NewVR);
+ } else {
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed, &NewVR);
+ }
+ break;
+ }
+ case MachineCombinerPattern::FMLSv8f16_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLSv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLSv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;
@@ -4476,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv2f32_OP1:
case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
.add(Root.getOperand(2));
@@ -4496,7 +4662,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv4f32_OP1:
case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
.add(Root.getOperand(2));
@@ -4516,7 +4682,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv2f64_OP1:
case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
.add(Root.getOperand(2));
@@ -4617,15 +4783,15 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- unsigned VReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ Register VReg = MI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
// Look through COPY instructions to find definition.
while (DefMI->isCopy()) {
- unsigned CopyVReg = DefMI->getOperand(1).getReg();
+ Register CopyVReg = DefMI->getOperand(1).getReg();
if (!MRI->hasOneNonDBGUse(CopyVReg))
return false;
if (!MRI->hasOneDef(CopyVReg))
@@ -4653,8 +4819,8 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
return false;
MachineOperand &MO = DefMI->getOperand(1);
- unsigned NewReg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+ Register NewReg = MO.getReg();
+ if (!Register::isVirtualRegister(NewReg))
return false;
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
@@ -4737,9 +4903,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_COFFSTUB, "aarch64-coffstub"},
- {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
- {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
- {MO_DLLIMPORT, "aarch64-dllimport"}};
+ {MO_GOT, "aarch64-got"},
+ {MO_NC, "aarch64-nc"},
+ {MO_S, "aarch64-s"},
+ {MO_TLS, "aarch64-tls"},
+ {MO_DLLIMPORT, "aarch64-dllimport"},
+ {MO_PREL, "aarch64-prel"},
+ {MO_TAGGED, "aarch64-tagged"}};
return makeArrayRef(TargetFlags);
}
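
The changes to this file also migrate register-valued locals from 'unsigned' to the Register value type and replace TargetRegisterInfo::isVirtualRegister with Register::isVirtualRegister. A simplified analogue of such a wrapper, purely illustrative; the high-bit convention below is an assumption, not a statement about LLVM's actual implementation:

    #include <cstdint>

    class RegisterId {
      uint32_t Id;
    public:
      explicit RegisterId(uint32_t Id) : Id(Id) {}
      operator uint32_t() const { return Id; }      // keeps old unsigned call sites working
      static bool isVirtualRegister(uint32_t Id) {  // was a TargetRegisterInfo static
        return static_cast<int32_t>(Id) < 0;        // assumed: virtual regs use the high bit
      }
      bool isVirtual() const { return isVirtualRegister(Id); }
    };
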
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 7be4daba7dc4..1688045e4fb8 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -15,6 +15,7 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64StackOffset.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -55,8 +56,7 @@ public:
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -299,7 +299,7 @@ private:
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
- int Offset, const TargetInstrInfo *TII,
+ StackOffset Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false, bool NeedsWinCFI = false,
bool *HasWinCFI = nullptr);
@@ -308,7 +308,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
/// FP. Return false if the offset could not be handled directly in MI, and
/// return the left-over portion by reference.
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII);
/// Use to report the frame offset status in isAArch64FrameOffsetLegal.
@@ -332,10 +332,10 @@ enum AArch64FrameOffsetStatus {
/// If set, @p EmittableOffset contains the amount that can be set in @p MI
/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
/// is a legal offset.
-int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset,
bool *OutUseUnscaledOp = nullptr,
unsigned *OutUnscaledOp = nullptr,
- int *EmittableOffset = nullptr);
+ int64_t *EmittableOffset = nullptr);
static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
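
emitFrameOffset, rewriteAArch64FrameIndex and isAArch64FrameOffsetLegal now take a StackOffset (from the newly included AArch64StackOffset.h) instead of a plain int, and the emittable offset widens to int64_t. A minimal sketch of the assumed shape of such a type, separating a fixed byte part from a scalable (SVE) part; illustrative only, not the real header:

    #include <cstdint>

    struct SimpleStackOffset {
      int64_t Bytes = 0;          // fixed part, in bytes
      int64_t ScalableBytes = 0;  // scaled by the runtime SVE vector length

      SimpleStackOffset &operator+=(const SimpleStackOffset &O) {
        Bytes += O.Bytes;
        ScalableBytes += O.ScalableBytes;
        return *this;
      }
      bool isZero() const { return Bytes == 0 && ScalableBytes == 0; }
    };
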
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eed53f36d574..1981bd5d3bf0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">,
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
AssemblerPredicate<"FeatureSEL2", "sel2">;
+def HasPMU : Predicate<"Subtarget->hasPMU()">,
+ AssemblerPredicate<"FeaturePMU", "pmu">;
+
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
@@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+ AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<"FeatureRCPC", "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">,
AssemblerPredicate<"FeatureBranchTargetId", "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
AssemblerPredicate<"FeatureMTE", "mte">;
+def HasTME : Predicate<"Subtarget->hasTME()">,
+ AssemblerPredicate<"FeatureTME", "tme">;
+def HasETE : Predicate<"Subtarget->hasETE()">,
+ AssemblerPredicate<"FeatureETE", "ete">;
+def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
+ AssemblerPredicate<"FeatureTRBE", "trbe">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, S
def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def SDT_AArch64unpk : SDTypeProfile<1, 1, [
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
+def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
+def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
+def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+
+ // Toggles patterns which aren't beneficial in GlobalISel when we aren't
+ // optimizing. This allows us to selectively use patterns without impacting
+ // SelectionDAG's behaviour.
+ // FIXME: One day there will probably be a nicer way to check for this, but
+ // today is not that day.
+ def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
}
include "AArch64InstrFormats.td"
@@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
def HWASAN_CHECK_MEMACCESS : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
- [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
+ [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
+ Sched<[]>;
+def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo<
+ (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
+ [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
Sched<[]>;
}
@@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+let Predicates = [HasTME] in {
+
+def TSTART : TMSystemI<0b0000, "tstart",
+ [(set GPR64:$Rt, (int_aarch64_tstart))]>;
+
+def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
+
+def TCANCEL : TMSystemException<0b011, "tcancel",
+ [(int_aarch64_tcancel i64_imm0_65535:$imm)]>;
+
+def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> {
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+} // HasTME
+
//===----------------------------------------------------------------------===//
// Move immediate instructions.
//===----------------------------------------------------------------------===//
@@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>;
// Final group of aliases covers true "mov $Rd, $imm" cases.
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
@@ -917,8 +962,12 @@ def trunc_imm : SDNodeXForm<imm, [{
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
GISDNodeXFormEquiv<trunc_imm>;
+let Predicates = [OptimizedGISelOrOtherSelector] in {
+// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless
+// copies.
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+}
// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
@@ -1012,10 +1061,10 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
let AddedComplexity = 1 in {
-def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
- (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
-def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
- (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>;
}
// Because of the immediate format for add/sub-imm instructions, the
@@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) {
const DataLayout &DL = MF->getDataLayout();
- unsigned Align = G->getGlobal()->getPointerAlignment(DL);
- return Align >= 4 && G->getOffset() % 4 == 0;
+ MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL);
+ return Align && *Align >= 4 && G->getOffset() % 4 == 0;
}
if (auto *C = dyn_cast<ConstantPoolSDNode>(N))
return C->getAlignment() >= 4 && C->getOffset() % 4 == 0;
@@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
// the NEON variant.
+
+// Here we handle first -(a + b*c) for FNMADD:
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)),
+ (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
-// "(-a) + b*(-c)".
+// Now it's time for "(-a) + (-b)*c"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+// And here "(-a) + b*(-c)"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
@@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
+// Extracting lane zero is a special case where we can just use a plain
+// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the
+// rest of the compiler, especially the register allocator and copy propagation,
+// to reason about, so is preferred when it's possible to use it.
+let AddedComplexity = 10 in {
+ def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>;
+ def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>;
+ def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>;
+}
+
+// dot_v4i8
+class mul_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm, node:$offset),
+ (mul (ldop (add node:$Rn, node:$offset)),
+ (ldop (add node:$Rm, node:$offset)))>;
+class mulz_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm),
+ (mul (ldop node:$Rn), (ldop node:$Rm))>;
+
+def load_v4i8 :
+ OutPatFrag<(ops node:$R),
+ (INSERT_SUBREG
+ (v2i32 (IMPLICIT_DEF)),
+ (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)),
+ ssub)>;
+
+class dot_v4i8<Instruction DOT, SDPatternOperator ldop> :
+ Pat<(i32 (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)),
+ (mulz_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm))))),
+ (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR),
+ (load_v4i8 GPR64sp:$Rn),
+ (load_v4i8 GPR64sp:$Rm))),
+ sub_32)>, Requires<[HasDotProd]>;
+
+// dot_v8i8
+class ee_v8i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K),
+ (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>;
+
+class mul_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K),
+ (mulop (v4i16 (ee_v8i8<extend> node:$M, node:$K)),
+ (v4i16 (ee_v8i8<extend> node:$N, node:$K)))>;
+
+class idot_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 0)),
+ (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 4))))),
+ (i64 0)))>;
+
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>;
+
+class odot_v8i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (EXTRACT_SUBREG
+ (VADDV_32
+ (i64 (DOT (DUPv2i32gpr WZR),
+ (v8i8 node:$Vm),
+ (v8i8 node:$Vn)))),
+ sub_32)>;
+
+class dot_v8i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v8i8<mulop, extend> V64:$Vm, V64:$Vn),
+ (odot_v8i8<DOT> V64:$Vm, V64:$Vn)>,
+ Requires<[HasDotProd]>;
+
+// dot_v16i8
+class ee_v16i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K1, node:$K2),
+ (v4i16 (extract_subvector
+ (v8i16 (extend
+ (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>;
+
+class mul_v16i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2),
+ (v4i32
+ (mulop (v4i16 (ee_v16i8<extend> node:$M, node:$K1, node:$K2)),
+ (v4i16 (ee_v16i8<extend> node:$N, node:$K1, node:$K2))))>;
+
+class idot_v16i8<SDPatternOperator m, SDPatternOperator x> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 0)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 0))),
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 4)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 4)))))),
+ (i64 0)))>;
+
+class odot_v16i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (i32 (ADDVv4i32v
+ (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>;
+
+class dot_v16i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v16i8<mulop, extend> V128:$Vm, V128:$Vn),
+ (odot_v16i8<DOT> V128:$Vm, V128:$Vn)>,
+ Requires<[HasDotProd]>;
+
+let AddedComplexity = 10 in {
+ def : dot_v4i8<SDOTv8i8, sextloadi8>;
+ def : dot_v4i8<UDOTv8i8, zextloadi8>;
+ def : dot_v8i8<SDOTv8i8, AArch64smull, sext>;
+ def : dot_v8i8<UDOTv8i8, AArch64umull, zext>;
+ def : dot_v16i8<SDOTv16i8, AArch64smull, sext>;
+ def : dot_v16i8<UDOTv16i8, AArch64umull, zext>;
+
+ // FIXME: add patterns to generate vector by element dot product.
+ // FIXME: add SVE dot-product patterns.
+}
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 4e13fb8e2027..961f38cad1e4 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -51,9 +51,19 @@ public:
const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
+ void setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) override {
+ InstructionSelector::setupMF(MF, KB, CoverageInfo);
+
+ // hasFnAttribute() is expensive to call on every BRCOND selection, so
+ // cache it here for each run of the selector.
+ ProduceNonFlagSettingCondBr =
+ !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+ }
+
private:
/// tblgen-erated 'select' implementation, used as the initial selector for
/// the patterns that don't require complex C++.
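
The setupMF override in the hunk above caches the SpeculativeLoadHardening query once per function because hasFnAttribute() is too expensive to call on every conditional-branch selection. A standalone sketch of that caching idiom; the types and attribute string below are placeholders, not LLVM API:

    #include <string>
    #include <unordered_set>

    struct FunctionLike {
      std::unordered_set<std::string> Attrs;
      bool hasAttribute(const std::string &A) const { return Attrs.count(A) != 0; }
    };

    class SelectorLike {
      bool ProduceNonFlagSettingCondBr = false;  // cached once per function
    public:
      void setupFunction(const FunctionLike &F) {
        ProduceNonFlagSettingCondBr = !F.hasAttribute("speculative-load-hardening");
      }
      // Cheap to query on every conditional-branch selection.
      bool canUseFlagClobberFreeCondBr() const { return ProduceNonFlagSettingCondBr; }
    };
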
@@ -68,6 +78,10 @@ private:
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ /// Eliminate same-sized cross-bank copies into stores before selectImpl().
+ void contractCrossBankCopyIntoStore(MachineInstr &I,
+ MachineRegisterInfo &MRI) const;
+
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
@@ -101,8 +115,6 @@ private:
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
- void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
- SmallVectorImpl<Optional<int>> &Idxs) const;
bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -116,6 +128,7 @@ private:
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
@@ -128,6 +141,8 @@ private:
MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitTST(const Register &LHS, const Register &RHS,
@@ -155,7 +170,9 @@ private:
ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
+ ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
+ ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
unsigned Size) const;
@@ -183,11 +200,48 @@ private:
return selectAddrModeIndexed(Root, Width / 8);
}
+ bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const;
+ ComplexRendererFns
+ selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+ unsigned SizeInBytes) const;
+ ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+ ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const;
+ template <int Width>
+ ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
+ return selectAddrModeXRO(Root, Width / 8);
+ }
+
+ ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
+
+ ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
+ return selectShiftedRegister(Root);
+ }
+
+ ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
+ // TODO: selectShiftedRegister should allow for rotates on logical shifts.
+ // For now, make them the same. The only difference between the two is that
+ // logical shifts are allowed to fold in rotates. Otherwise, these are
+ // functionally the same.
+ return selectShiftedRegister(Root);
+ }
+
+ /// Instructions that accept extend modifiers like UXTW expect the register
+ /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a
+ /// subregister copy if necessary. Return either ExtReg, or the result of the
+ /// new copy.
+ Register narrowExtendRegIfNeeded(Register ExtReg,
+ MachineIRBuilder &MIB) const;
+ ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
+
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
+ void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const;
+ void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V,
- unsigned char OpFlags) const;
+ unsigned OpFlags) const;
// Optimization methods.
bool tryOptVectorShuffle(MachineInstr &I) const;
@@ -197,12 +251,22 @@ private:
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ /// Return true if \p MI is a load or store of \p NumBytes bytes.
+ bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
+
+ /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
+ /// register zeroed out. In other words, the result of MI has been explicitly
+ /// zero extended.
+ bool isDef32(const MachineInstr &MI) const;
+
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
const AArch64RegisterInfo &TRI;
const AArch64RegisterBankInfo &RBI;
+ bool ProduceNonFlagSettingCondBr = false;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -312,7 +376,7 @@ static bool getSubRegForClass(const TargetRegisterClass *RC,
SubReg = AArch64::hsub;
break;
case 32:
- if (RC == &AArch64::GPR32RegClass)
+ if (RC != &AArch64::FPR32RegClass)
SubReg = AArch64::sub_32;
else
SubReg = AArch64::ssub;
@@ -357,7 +421,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
// so, this will need to be taught about that, and we'll need to get the
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
- if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (!Register::isVirtualRegister(MO.getReg())) {
LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
@@ -492,8 +556,8 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
@@ -502,7 +566,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
(DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
+ (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
// Copies are a mean to copy bits around, as long as we are
// on the same register class, that's fine. Otherwise, that
// means we need some SUBREG_TO_REG or AND & co.
@@ -526,7 +590,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
/// SubRegCopy (To class) = COPY CopyReg:SubReg
/// Dst = COPY SubRegCopy
static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
- const RegisterBankInfo &RBI, unsigned SrcReg,
+ const RegisterBankInfo &RBI, Register SrcReg,
const TargetRegisterClass *From,
const TargetRegisterClass *To,
unsigned SubReg) {
@@ -539,7 +603,7 @@ static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
// It's possible that the destination register won't be constrained. Make
// sure that happens.
- if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()))
+ if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
return true;
@@ -553,8 +617,8 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
@@ -579,8 +643,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
@@ -607,11 +671,10 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
// result.
auto CheckCopy = [&]() {
// If we have a bitcast or something, we can't have physical registers.
- assert(
- (I.isCopy() ||
- (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) &&
- !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) &&
- "No phys reg on generic operator!");
+ assert((I.isCopy() ||
+ (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
+ !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
+ "No phys reg on generic operator!");
assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
(void)KnownValid;
return true;
@@ -626,38 +689,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return false;
}
- // Is this a cross-bank copy?
- if (DstRegBank.getID() != SrcRegBank.getID()) {
- // If we're doing a cross-bank copy on different-sized registers, we need
- // to do a bit more work.
- unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
- unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
-
- if (SrcSize > DstSize) {
- // We're doing a cross-bank copy into a smaller register. We need a
- // subregister copy. First, get a register class that's on the same bank
- // as the destination, but the same size as the source.
- const TargetRegisterClass *SubregRC =
- getMinClassForRegBank(DstRegBank, SrcSize, true);
- assert(SubregRC && "Didn't get a register class for subreg?");
-
- // Get the appropriate subregister for the destination.
- unsigned SubReg = 0;
- if (!getSubRegForClass(DstRC, TRI, SubReg)) {
- LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
- return false;
- }
-
- // Now, insert a subregister copy using the new register class.
- selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
- return CheckCopy();
+ unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
+ unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+
+ // If we're doing a cross-bank copy on different-sized registers, we need
+ // to do a bit more work.
+ if (SrcSize > DstSize) {
+ // We're doing a cross-bank copy into a smaller register. We need a
+ // subregister copy. First, get a register class that's on the same bank
+ // as the destination, but the same size as the source.
+ const TargetRegisterClass *SubregRC =
+ getMinClassForRegBank(DstRegBank, SrcSize, true);
+ assert(SubregRC && "Didn't get a register class for subreg?");
+
+ // Get the appropriate subregister for the destination.
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(DstRC, TRI, SubReg)) {
+ LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
+ return false;
}
- else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
- SrcSize == 16) {
+ // Now, insert a subregister copy using the new register class.
+ selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
+ return CheckCopy();
+ }
+
+ // Is this a cross-bank copy?
+ if (DstRegBank.getID() != SrcRegBank.getID()) {
+ if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
+ SrcSize == 16) {
// Special case for FPR16 to GPR32.
// FIXME: This can probably be generalized like the above case.
- unsigned PromoteReg =
+ Register PromoteReg =
MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
@@ -674,7 +737,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
// If the destination is a physical register, then there's nothing to
// change, so we're done.
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (Register::isPhysicalRegister(DstReg))
return CheckCopy();
}
@@ -955,7 +1018,9 @@ bool AArch64InstructionSelector::selectVectorSHL(
return false;
unsigned Opc = 0;
- if (Ty == LLT::vector(4, 32)) {
+ if (Ty == LLT::vector(2, 64)) {
+ Opc = AArch64::USHLv2i64;
+ } else if (Ty == LLT::vector(4, 32)) {
Opc = AArch64::USHLv4i32;
} else if (Ty == LLT::vector(2, 32)) {
Opc = AArch64::USHLv2i32;
@@ -989,7 +1054,11 @@ bool AArch64InstructionSelector::selectVectorASHR(
unsigned Opc = 0;
unsigned NegOpc = 0;
const TargetRegisterClass *RC = nullptr;
- if (Ty == LLT::vector(4, 32)) {
+ if (Ty == LLT::vector(2, 64)) {
+ Opc = AArch64::SSHLv2i64;
+ NegOpc = AArch64::NEGv2i64;
+ RC = &AArch64::FPR128RegClass;
+ } else if (Ty == LLT::vector(4, 32)) {
Opc = AArch64::SSHLv4i32;
NegOpc = AArch64::NEGv4i32;
RC = &AArch64::FPR128RegClass;
@@ -1044,7 +1113,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin(
}
void AArch64InstructionSelector::materializeLargeCMVal(
- MachineInstr &I, const Value *V, unsigned char OpFlags) const {
+ MachineInstr &I, const Value *V, unsigned OpFlags) const {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1097,8 +1166,8 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
// some reason we receive input GMIR that has an s64 shift amount that's not
// a G_CONSTANT, insert a truncate so that we can still select the s32
// register-register variant.
- unsigned SrcReg = I.getOperand(1).getReg();
- unsigned ShiftReg = I.getOperand(2).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ Register ShiftReg = I.getOperand(2).getReg();
const LLT ShiftTy = MRI.getType(ShiftReg);
const LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.isVector())
@@ -1118,6 +1187,9 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
}
return;
}
+ case TargetOpcode::G_STORE:
+ contractCrossBankCopyIntoStore(I, MRI);
+ return;
default:
return;
}
@@ -1158,6 +1230,48 @@ bool AArch64InstructionSelector::earlySelectSHL(
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
+void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
+ // If we're storing a scalar, it doesn't matter what register bank that
+ // scalar is on. All that matters is the size.
+ //
+ // So, if we see something like this (with a 32-bit scalar as an example):
+ //
+ // %x:gpr(s32) = ... something ...
+ // %y:fpr(s32) = COPY %x:gpr(s32)
+ // G_STORE %y:fpr(s32)
+ //
+ // We can fix this up into something like this:
+ //
+ // G_STORE %x:gpr(s32)
+ //
+ // And then continue the selection process normally.
+ MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI);
+ if (!Def)
+ return;
+ Register DefDstReg = Def->getOperand(0).getReg();
+ LLT DefDstTy = MRI.getType(DefDstReg);
+ Register StoreSrcReg = I.getOperand(0).getReg();
+ LLT StoreSrcTy = MRI.getType(StoreSrcReg);
+
+ // If we get something strange like a physical register, then we shouldn't
+ // go any further.
+ if (!DefDstTy.isValid())
+ return;
+
+ // Are the source and dst types the same size?
+ if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
+ return;
+
+ if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
+ RBI.getRegBank(DefDstReg, MRI, TRI))
+ return;
+
+ // We have a cross-bank copy, which is entering a store. Let's fold it.
+ I.getOperand(0).setReg(DefDstReg);
+}
+
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1169,13 +1283,37 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
+ case TargetOpcode::G_CONSTANT: {
+ bool IsZero = false;
+ if (I.getOperand(1).isCImm())
+ IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
+ else if (I.getOperand(1).isImm())
+ IsZero = I.getOperand(1).getImm() == 0;
+
+ if (!IsZero)
+ return false;
+
+ Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32))
+ return false;
+
+ if (Ty == LLT::scalar(64)) {
+ I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
+ RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
+ } else {
+ I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
+ RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
+ }
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
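+  // Illustrative sketch (not from the original source): with the G_CONSTANT
+  // case above, a zero constant such as
+  //   %0:gpr(s64) = G_CONSTANT i64 0
+  // is turned into a copy from the zero register,
+  //   %0:gpr64 = COPY $xzr
+  // (the s32 case uses $wzr and the GPR32 register class instead).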
default:
return false;
}
}
-bool AArch64InstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool AArch64InstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1244,7 +1382,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (earlySelect(I))
return true;
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
LLT Ty =
@@ -1439,14 +1577,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return true;
}
case TargetOpcode::G_EXTRACT: {
- LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
- LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(DstReg);
(void)DstTy;
unsigned SrcSize = SrcTy.getSizeInBits();
- // Larger extracts are vectors, same-size extracts should be something else
- // by now (either split up or simplified to a COPY).
- if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32)
- return false;
+
+ if (SrcTy.getSizeInBits() > 64) {
+ // This should be an extract of an s128, which is like a vector extract.
+ if (SrcTy.getSizeInBits() != 128)
+ return false;
+ // Only support extracting 64 bits from an s128 at the moment.
+ if (DstTy.getSizeInBits() != 64)
+ return false;
+
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ // Check we have the right regbank always.
+ assert(SrcRB.getID() == AArch64::FPRRegBankID &&
+ DstRB.getID() == AArch64::FPRRegBankID &&
+ "Wrong extract regbank!");
+ (void)SrcRB;
+
+ // Emit the same code as a vector extract.
+ // Offset must be a multiple of 64.
+ unsigned Offset = I.getOperand(2).getImm();
+ if (Offset % 64 != 0)
+ return false;
+ unsigned LaneIdx = Offset / 64;
+ MachineIRBuilder MIB(I);
+ MachineInstr *Extract = emitExtractVectorElt(
+ DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
+ if (!Extract)
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
@@ -1458,7 +1625,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(DstReg, 0, AArch64::sub_32);
@@ -1521,11 +1688,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_GLOBAL_VALUE: {
auto GV = I.getOperand(1).getGlobal();
- if (GV->isThreadLocal()) {
- // FIXME: we don't support TLS yet.
- return false;
- }
- unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
+ if (GV->isThreadLocal())
+ return selectTLSGlobalValue(I, MRI);
+
+ unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
@@ -1562,8 +1728,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ if (MemOp.isAtomic()) {
+ // For now we just support s8 acquire loads to be able to compile stack
+ // protector code.
+ if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
+ MemOp.getSize() == 1) {
+ I.setDesc(TII.get(AArch64::LDARB));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+ LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
return false;
}
unsigned MemSizeInBits = MemOp.getSize() * 8;
@@ -1598,7 +1771,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const unsigned Size = MemSizeInBits / 8;
const unsigned Scale = Log2_32(Size);
if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
- unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+ Register Ptr2Reg = PtrMI->getOperand(1).getReg();
I.getOperand(1).setReg(Ptr2Reg);
PtrMI = MRI.getVRegDef(Ptr2Reg);
Offset = Imm / Size;
@@ -1688,8 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return selectVectorSHL(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_OR:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_GEP: {
+ case TargetOpcode::G_LSHR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -1711,6 +1883,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_GEP: {
+ MachineIRBuilder MIRBuilder(I);
+ emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
+ MIRBuilder);
+ I.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_UADDO: {
// TODO: Support other types.
unsigned OpSize = Ty.getSizeInBits();
@@ -1816,6 +1995,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
+
+ if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
+ MachineIRBuilder MIB(I);
+ MachineInstr *Extract = emitExtractVectorElt(
+ DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
+ if (!Extract)
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
}
return false;
@@ -1868,21 +2057,41 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
- const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
- SrcTy = MRI.getType(I.getOperand(1).getReg());
- const bool isSigned = Opcode == TargetOpcode::G_SEXT;
+ const bool IsSigned = Opcode == TargetOpcode::G_SEXT;
const Register DefReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+ const LLT DstTy = MRI.getType(DefReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = SrcTy.getSizeInBits();
- if (RB.getID() != AArch64::GPRRegBankID) {
- LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
- << ", expected: GPR\n");
- return false;
- }
+ assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
+ AArch64::GPRRegBankID &&
+ "Unexpected ext regbank");
+ MachineIRBuilder MIB(I);
MachineInstr *ExtI;
- if (DstTy == LLT::scalar(64)) {
+ if (DstTy.isVector())
+ return false; // Should be handled by imported patterns.
+
+  // First, check whether we're extending the result of a load with a dest type
+  // smaller than 32 bits; if so, this zext is redundant. GPR32 is the smallest
+  // GPR register on AArch64, and all loads which are smaller automatically
+  // zero-extend the upper bits. E.g.
+ // %v(s8) = G_LOAD %p, :: (load 1)
+ // %v2(s32) = G_ZEXT %v(s8)
+ if (!IsSigned) {
+ auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
+ if (LoadMI &&
+ RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
+ const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
+ unsigned BytesLoaded = MemOp->getSize();
+ if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
+ }
+
+ if (DstSize == 64) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
@@ -1890,33 +2099,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- const Register SrcXReg =
- MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
- .addDef(SrcXReg)
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::sub_32);
-
- const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri;
- ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
- .addDef(DefReg)
- .addUse(SrcXReg)
- .addImm(0)
- .addImm(SrcTy.getSizeInBits() - 1);
- } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) {
- const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri;
- ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
- .addDef(DefReg)
- .addUse(SrcReg)
- .addImm(0)
- .addImm(SrcTy.getSizeInBits() - 1);
+ auto SubregToReg =
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {})
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32);
+
+ ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
+ {DefReg}, {SubregToReg})
+ .addImm(0)
+ .addImm(SrcSize - 1);
+ } else if (DstSize <= 32) {
+ ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
+ {DefReg}, {SrcReg})
+ .addImm(0)
+ .addImm(SrcSize - 1);
} else {
return false;
}
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
-
I.eraseFromParent();
return true;
}
@@ -2163,6 +2365,37 @@ bool AArch64InstructionSelector::selectJumpTable(
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
+bool AArch64InstructionSelector::selectTLSGlobalValue(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ if (!STI.isTargetMachO())
+ return false;
+ MachineFunction &MF = *I.getParent()->getParent();
+ MF.getFrameInfo().setAdjustsStack(true);
+
+ const GlobalValue &GV = *I.getOperand(1).getGlobal();
+ MachineIRBuilder MIB(I);
+
+ MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {})
+ .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
+
+ auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
+ {Register(AArch64::X0)})
+ .addImm(0);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+ // silly).
+ MIB.buildInstr(AArch64::BLR, {}, {Load})
+ .addDef(AArch64::X0, RegState::Implicit)
+ .addRegMask(TRI.getTLSCallPreservedMask());
+
+ MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
+ MRI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectIntrinsicTrunc(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
@@ -2478,16 +2711,40 @@ bool AArch64InstructionSelector::selectMergeValues(
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
+ const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
- // At the moment we only support merging two s32s into an s64.
if (I.getNumOperands() != 3)
return false;
- if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
- return false;
- const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
+
+ // Merging 2 s64s into an s128.
+ if (DstTy == LLT::scalar(128)) {
+ if (SrcTy.getSizeInBits() != 64)
+ return false;
+ MachineIRBuilder MIB(I);
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src1Reg = I.getOperand(1).getReg();
+ Register Src2Reg = I.getOperand(2).getReg();
+ auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
+ MachineInstr *InsMI =
+ emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
+ if (!InsMI)
+ return false;
+ MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
+ Src2Reg, /* LaneIdx */ 1, RB, MIB);
+ if (!Ins2MI)
+ return false;
+ constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+ }
+
if (RB.getID() != AArch64::GPRRegBankID)
return false;
+ if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
+ return false;
+
auto *DstRC = &AArch64::GPR64RegClass;
Register SubToRegDef = MRI.createVirtualRegister(DstRC);
MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -2695,7 +2952,8 @@ bool AArch64InstructionSelector::selectUnmergeValues(
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
const LLT WideTy = MRI.getType(SrcReg);
(void)WideTy;
- assert(WideTy.isVector() && "can only unmerge from vector types!");
+ assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
+ "can only unmerge from vector or s128 types!");
assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
"source register size too small!");
@@ -2802,29 +3060,6 @@ bool AArch64InstructionSelector::selectConcatVectors(
return true;
}
-void AArch64InstructionSelector::collectShuffleMaskIndices(
- MachineInstr &I, MachineRegisterInfo &MRI,
- SmallVectorImpl<Optional<int>> &Idxs) const {
- MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
- assert(
- MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
- "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");
- // Find the constant indices.
- for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) {
- // Look through copies.
- MachineInstr *ScalarDef =
- getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI);
- assert(ScalarDef && "Could not find vreg def of shufflevec index op");
- if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) {
- // This be an undef if not a constant.
- assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF);
- Idxs.push_back(None);
- } else {
- Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue());
- }
- }
-}
-
unsigned
AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
MachineFunction &MF) const {
@@ -2906,6 +3141,31 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
}
MachineInstr *
+AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri},
+ {AArch64::ADDWrr, AArch64::ADDWri}};
+ bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
+ auto ImmFns = selectArithImmed(RHS);
+ unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
+ auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()});
+
+ // If we matched a valid constant immediate, add those operands.
+ if (ImmFns) {
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(AddMI);
+ } else {
+ AddMI.addUse(RHS.getReg());
+ }
+
+ constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI);
+ return &*AddMI;
+}
+
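+// Illustrative sketch (not from the original source): for the G_GEP case
+// above, a 64-bit pointer add such as
+//   %ptr:gpr(p0) = G_GEP %base, %off(s64)
+// selects ADDXri when %off is a constant accepted by selectArithImmed (e.g.
+// 16), and falls back to the register-register ADDXrr form otherwise.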
+MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
@@ -3151,7 +3411,7 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
// Can't see past copies from physregs.
if (Opc == TargetOpcode::COPY &&
- TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg()))
+ Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
return false;
CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
@@ -3342,16 +3602,9 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
return false;
// The shuffle's second operand doesn't matter if the mask is all zero.
- auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI);
- if (!ZeroVec)
+ const Constant *Mask = I.getOperand(3).getShuffleMask();
+ if (!isa<ConstantAggregateZero>(Mask))
return false;
- int64_t Zero = 0;
- if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero)
- return false;
- for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) {
- if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg())
- return false; // This wasn't an all zeros vector.
- }
// We're done, now find out what kind of splat we need.
LLT VecTy = MRI.getType(I.getOperand(0).getReg());
@@ -3399,19 +3652,14 @@ bool AArch64InstructionSelector::selectShuffleVector(
const LLT Src1Ty = MRI.getType(Src1Reg);
Register Src2Reg = I.getOperand(2).getReg();
const LLT Src2Ty = MRI.getType(Src2Reg);
+ const Constant *ShuffleMask = I.getOperand(3).getShuffleMask();
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
LLVMContext &Ctx = MF.getFunction().getContext();
- // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
- // operand, it comes in as a normal vector value which we have to analyze to
- // find the mask indices. If the mask element is undef, then
- // collectShuffleMaskIndices() will add a None entry for that index into
- // the list.
- SmallVector<Optional<int>, 8> Mask;
- collectShuffleMaskIndices(I, MRI, Mask);
- assert(!Mask.empty() && "Expected to find mask indices");
+ SmallVector<int, 8> Mask;
+ ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask);
// G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
// it's originated from a <1 x T> type. Those should have been lowered into
@@ -3424,10 +3672,10 @@ bool AArch64InstructionSelector::selectShuffleVector(
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
SmallVector<Constant *, 64> CstIdxs;
- for (auto &MaybeVal : Mask) {
+ for (int Val : Mask) {
      // For now, we'll just assume any undef index to be 0. This should be
      // optimized in the future, e.g. to select DUP etc.
- int Val = MaybeVal.hasValue() ? *MaybeVal : 0;
+ Val = Val < 0 ? 0 : Val;
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
@@ -3684,21 +3932,6 @@ static unsigned findIntrinsicID(MachineInstr &I) {
return IntrinOp->getIntrinsicID();
}
-/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr
-/// intrinsic.
-static unsigned getStlxrOpcode(unsigned NumBytesToStore) {
- switch (NumBytesToStore) {
- // TODO: 1, 2, and 4 byte stores.
- case 8:
- return AArch64::STLXRX;
- default:
- LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! ("
- << NumBytesToStore << ")\n");
- break;
- }
- return 0;
-}
-
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// Find the intrinsic ID.
@@ -3719,32 +3952,6 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
return false;
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
- case Intrinsic::aarch64_stlxr:
- Register StatReg = I.getOperand(0).getReg();
- assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 &&
- "Status register must be 32 bits!");
- Register SrcReg = I.getOperand(2).getReg();
-
- if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) {
- LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n");
- return false;
- }
-
- Register PtrReg = I.getOperand(3).getReg();
- assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand");
-
- // Expect only one memory operand.
- if (!I.hasOneMemOperand())
- return false;
-
- const MachineMemOperand *MemOp = *I.memoperands_begin();
- unsigned NumBytesToStore = MemOp->getSize();
- unsigned Opc = getStlxrOpcode(NumBytesToStore);
- if (!Opc)
- return false;
-
- auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg});
- constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI);
}
I.eraseFromParent();
@@ -3860,6 +4067,30 @@ AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
+/// Helper to select an immediate value that can be represented as a 12-bit
+/// value shifted left by either 0 or 12. If it is possible to do so, return
+/// the immediate and shift value. If not, return None.
+///
+/// Used by selectArithImmed and selectNegArithImmed.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::select12BitValueWithLeftShift(
+ uint64_t Immed) const {
+ unsigned ShiftAmt;
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return None;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
+ }};
+}
+
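+// Illustrative example (not from the original source): 0xabc is selected as
+// {imm = 0xabc, lsl #0}, 0x123000 as {imm = 0x123, lsl #12}, and 0x1001
+// (low 12 bits set but wider than 12 bits) is rejected with None.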
/// SelectArithImmed - Select an immediate value that can be represented as
/// a 12-bit value shifted left by either 0 or 12. If so, return true with
/// Val set to the 12-bit value and Shift set to the shifter operand.
@@ -3873,22 +4104,229 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None)
return None;
+ return select12BitValueWithLeftShift(*MaybeImmed);
+}
+
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
+ // We need a register here, because we need to know if we have a 64 or 32
+ // bit immediate.
+ if (!Root.isReg())
+ return None;
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None)
+ return None;
uint64_t Immed = *MaybeImmed;
- unsigned ShiftAmt;
- if (Immed >> 12 == 0) {
- ShiftAmt = 0;
- } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
- ShiftAmt = 12;
- Immed = Immed >> 12;
- } else
+ // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+ // have the opposite effect on the C flag, so this pattern mustn't match under
+ // those circumstances.
+ if (Immed == 0)
return None;
- unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
- }};
+ // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
+ // the root.
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
+ Immed = ~((uint32_t)Immed) + 1;
+ else
+ Immed = ~Immed + 1ULL;
+
+ if (Immed & 0xFFFFFFFFFF000000ULL)
+ return None;
+
+ Immed &= 0xFFFFFFULL;
+ return select12BitValueWithLeftShift(Immed);
+}
+
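+// Worked example (an illustration, not from the original source): for a 32-bit
+// operand of -5 (0xfffffffb), the negation above yields 5, which is selected
+// as {imm = 5, lsl #0}; a zero immediate is rejected because negating it would
+// invert the meaning of the C flag, as noted above.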
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+ MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+ // Always fold if there is one use, or if we're optimizing for size.
+ Register DefReg = MI.getOperand(0).getReg();
+ if (MRI.hasOneUse(DefReg) ||
+ MI.getParent()->getParent()->getFunction().hasMinSize())
+ return true;
+
+ // It's better to avoid folding and recomputing shifts when we don't have a
+ // fastpath.
+ if (!STI.hasLSLFast())
+ return false;
+
+ // We have a fastpath, so folding a shift in and potentially computing it
+ // many times may be beneficial. Check if this is only used in memory ops.
+ // If it is, then we should fold.
+ return all_of(MRI.use_instructions(DefReg),
+ [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+ MachineOperand &Root, unsigned SizeInBytes) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // Make sure that the memory op is a valid size.
+ int64_t LegalShiftVal = Log2_32(SizeInBytes);
+ if (LegalShiftVal == 0)
+ return None;
+
+ // We want to find something like this:
+ //
+ // val = G_CONSTANT LegalShiftVal
+ // shift = G_SHL off_reg val
+ // ptr = G_GEP base_reg shift
+ // x = G_LOAD ptr
+ //
+ // And fold it into this addressing mode:
+ //
+ // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+ // Check if we can find the G_GEP.
+ MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+ if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+ return None;
+
+ // Now, try to match an opcode which will match our specific offset.
+ // We want a G_SHL or a G_MUL.
+  MachineInstr *OffsetInst =
+      getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI);
+ if (!OffsetInst)
+ return None;
+
+ unsigned OffsetOpc = OffsetInst->getOpcode();
+ if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
+ return None;
+
+ if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
+ return None;
+
+ // Now, try to find the specific G_CONSTANT. Start by assuming that the
+ // register we will offset is the LHS, and the register containing the
+ // constant is the RHS.
+ Register OffsetReg = OffsetInst->getOperand(1).getReg();
+ Register ConstantReg = OffsetInst->getOperand(2).getReg();
+ auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!ValAndVReg) {
+ // We didn't get a constant on the RHS. If the opcode is a shift, then
+ // we're done.
+ if (OffsetOpc == TargetOpcode::G_SHL)
+ return None;
+
+ // If we have a G_MUL, we can use either register. Try looking at the RHS.
+ std::swap(OffsetReg, ConstantReg);
+ ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!ValAndVReg)
+ return None;
+ }
+
+ // The value must fit into 3 bits, and must be positive. Make sure that is
+ // true.
+ int64_t ImmVal = ValAndVReg->Value;
+
+ // Since we're going to pull this into a shift, the constant value must be
+ // a power of 2. If we got a multiply, then we need to check this.
+ if (OffsetOpc == TargetOpcode::G_MUL) {
+ if (!isPowerOf2_32(ImmVal))
+ return None;
+
+ // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
+ ImmVal = Log2_32(ImmVal);
+ }
+
+ if ((ImmVal & 0x7) != ImmVal)
+ return None;
+
+ // We are only allowed to shift by LegalShiftVal. This shift value is built
+ // into the instruction, so we can't just use whatever we want.
+ if (ImmVal != LegalShiftVal)
+ return None;
+
+ // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+ // offset. Signify that we are shifting by setting the shift flag to 1.
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(1).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
+ [=](MachineInstrBuilder &MIB) {
+ // Need to add both immediates here to make sure that they are both
+ // added to the instruction.
+ MIB.addImm(0);
+ MIB.addImm(1);
+ }}};
+}
+
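+// Illustrative sketch (not from the original source): the multiply form also
+// folds. E.g. for an 8-byte load,
+//
+//   cst = G_CONSTANT 8
+//   off = G_MUL idx_reg cst
+//   ptr = G_GEP base_reg off
+//   x = G_LOAD ptr
+//
+// becomes ldr x, [base_reg, idx_reg, lsl #3], since log2(8) == 3 matches
+// LegalShiftVal.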
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3]
+///
+/// Where x2 is the base register, and x3 is an offset register.
+///
+/// When possible (or profitable) to fold a G_GEP into the address calculation,
+/// this will do so. Otherwise, it will return None.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeRegisterOffset(
+ MachineOperand &Root) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // We need a GEP.
+ MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
+ if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
+ return None;
+
+ // If this is used more than once, let's not bother folding.
+ // TODO: Check if they are memory ops. If they are, then we can still fold
+ // without having to recompute anything.
+ if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
+ return None;
+
+ // Base is the GEP's LHS, offset is its RHS.
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(1).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(2).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) {
+ // Need to add both immediates here to make sure that they are both
+ // added to the instruction.
+ MIB.addImm(0);
+ MIB.addImm(0);
+ }}};
+}
+
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // If we have a constant offset, then we probably don't want to match a
+ // register offset.
+ if (isBaseWithConstantOffset(Root, MRI))
+ return None;
+
+ // Try to fold shifts into the addressing mode.
+ auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+ if (AddrModeFns)
+ return AddrModeFns;
+
+ // If that doesn't work, see if it's possible to fold in registers from
+ // a GEP.
+ return selectAddrModeRegisterOffset(Root);
}
/// Select a "register plus unscaled signed 9-bit immediate" address. This
@@ -3994,6 +4432,205 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+/// Given a shift instruction, return the correct shift type for that
+/// instruction.
+static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
+ // TODO: Handle AArch64_AM::ROR
+ switch (MI.getOpcode()) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case TargetOpcode::G_SHL:
+ return AArch64_AM::LSL;
+ case TargetOpcode::G_LSHR:
+ return AArch64_AM::LSR;
+ case TargetOpcode::G_ASHR:
+ return AArch64_AM::ASR;
+ }
+}
+
+/// Select a "shifted register" operand. If the value is not shifted, set the
+/// shift operand to a default value of "lsl 0".
+///
+/// TODO: Allow shifted register to be rotated in logical instructions.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ // Check if the operand is defined by an instruction which corresponds to
+ // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
+ //
+ // TODO: Handle AArch64_AM::ROR for logical instructions.
+ MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
+ if (!ShiftInst)
+ return None;
+ AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
+ if (ShType == AArch64_AM::InvalidShiftExtend)
+ return None;
+ if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
+ return None;
+
+ // Need an immediate on the RHS.
+ MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
+ auto Immed = getImmedFromMO(ShiftRHS);
+ if (!Immed)
+ return None;
+
+ // We have something that we can fold. Fold in the shift's LHS and RHS into
+ // the instruction.
+ MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
+ Register ShiftReg = ShiftLHS.getReg();
+
+ unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
+ unsigned Val = *Immed & (NumBits - 1);
+ unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
+}
+
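+// Illustrative sketch (not from the original source): given
+//   %c = G_CONSTANT i64 3
+//   %sh = G_SHL %x, %c
+// feeding e.g. an add, selectShiftedRegister renders the operand pair
+// {%x, lsl #3}; the immediate is masked to the register width first
+// (3 & 63 here).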
+/// Get the correct ShiftExtendType for an extend instruction.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ unsigned Opc = MI.getOpcode();
+
+ // Handle explicit extend instructions first.
+ if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ assert(Size != 64 && "Extend from 64 bits?");
+ switch (Size) {
+ case 8:
+ return AArch64_AM::SXTB;
+ case 16:
+ return AArch64_AM::SXTH;
+ case 32:
+ return AArch64_AM::SXTW;
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ }
+ }
+
+ if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ assert(Size != 64 && "Extend from 64 bits?");
+ switch (Size) {
+ case 8:
+ return AArch64_AM::UXTB;
+ case 16:
+ return AArch64_AM::UXTH;
+ case 32:
+ return AArch64_AM::UXTW;
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ }
+ }
+
+ // Don't have an explicit extend. Try to handle a G_AND with a constant mask
+ // on the RHS.
+ if (Opc != TargetOpcode::G_AND)
+ return AArch64_AM::InvalidShiftExtend;
+
+ Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
+ if (!MaybeAndMask)
+ return AArch64_AM::InvalidShiftExtend;
+ uint64_t AndMask = *MaybeAndMask;
+ switch (AndMask) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case 0xFF:
+ return AArch64_AM::UXTB;
+ case 0xFFFF:
+ return AArch64_AM::UXTH;
+ case 0xFFFFFFFF:
+ return AArch64_AM::UXTW;
+ }
+}
+
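+// Illustrative example (not from the original source): a G_ZEXT from s8 maps
+// to UXTB above, and a G_AND with mask 0xffff is treated as an implicit UXTH;
+// masks other than 0xff/0xffff/0xffffffff give InvalidShiftExtend.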
+Register AArch64InstructionSelector::narrowExtendRegIfNeeded(
+ Register ExtReg, MachineIRBuilder &MIB) const {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ if (MRI.getType(ExtReg).getSizeInBits() == 32)
+ return ExtReg;
+
+ // Insert a copy to move ExtReg to GPR32.
+ Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg});
+
+ // Select the copy into a subregister copy.
+ selectCopy(*Copy, TII, MRI, TRI, RBI);
+ return Copy.getReg(0);
+}
+
+/// Select an "extended register" operand. This operand folds in an extend
+/// followed by an optional left shift.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectArithExtendedRegister(
+ MachineOperand &Root) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ uint64_t ShiftVal = 0;
+ Register ExtReg;
+ AArch64_AM::ShiftExtendType Ext;
+ MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
+ if (!RootDef)
+ return None;
+
+ if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
+ return None;
+
+ // Check if we can fold a shift and an extend.
+ if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
+ // Look for a constant on the RHS of the shift.
+ MachineOperand &RHS = RootDef->getOperand(2);
+ Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
+ if (!MaybeShiftVal)
+ return None;
+ ShiftVal = *MaybeShiftVal;
+ if (ShiftVal > 4)
+ return None;
+ // Look for a valid extend instruction on the LHS of the shift.
+ MachineOperand &LHS = RootDef->getOperand(1);
+ MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
+ if (!ExtDef)
+ return None;
+ Ext = getExtendTypeForInst(*ExtDef, MRI);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+ ExtReg = ExtDef->getOperand(1).getReg();
+ } else {
+ // Didn't get a shift. Try just folding an extend.
+ Ext = getExtendTypeForInst(*RootDef, MRI);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+ ExtReg = RootDef->getOperand(1).getReg();
+
+ // If we have a 32 bit instruction which zeroes out the high half of a
+ // register, we get an implicit zero extend for free. Check if we have one.
+ // FIXME: We actually emit the extend right now even though we don't have
+ // to.
+ if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
+ MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
+ if (ExtInst && isDef32(*ExtInst))
+ return None;
+ }
+ }
+
+ // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
+ // copy.
+ MachineIRBuilder MIB(*RootDef);
+ ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB);
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(getArithExtendImm(Ext, ShiftVal));
+ }}};
+}
+
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -4003,6 +4640,51 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
MIB.addImm(CstVal.getValue());
}
+void AArch64InstructionSelector::renderLogicalImm32(
+ MachineInstrBuilder &MIB, const MachineInstr &I) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
+ uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
+ MIB.addImm(Enc);
+}
+
+void AArch64InstructionSelector::renderLogicalImm64(
+ MachineInstrBuilder &MIB, const MachineInstr &I) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
+ uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
+ MIB.addImm(Enc);
+}
+
+bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
+ const MachineInstr &MI, unsigned NumBytes) const {
+ if (!MI.mayLoadOrStore())
+ return false;
+ assert(MI.hasOneMemOperand() &&
+ "Expected load/store to have only one mem op!");
+ return (*MI.memoperands_begin())->getSize() == NumBytes;
+}
+
+bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
+ return false;
+
+ // Only return true if we know the operation will zero-out the high half of
+ // the 64-bit register. Truncates can be subregister copies, which don't
+ // zero out the high bits. Copies and other copy-like instructions can be
+ // fed by truncates, or could be lowered as subregister copies.
+ switch (MI.getOpcode()) {
+ default:
+ return true;
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_BITCAST:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_PHI:
+ return false;
+ }
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index a985b330eafa..7a1901bd5b1e 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,7 +13,9 @@
#include "AArch64LegalizerInfo.h"
#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -50,6 +52,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT v2s64 = LLT::vector(2, 64);
const LLT v2p0 = LLT::vector(2, p0);
+ // FIXME: support subtargets which have neon/fp-armv8 disabled.
+ if (!ST.hasNEON() || !ST.hasFPARMv8()) {
+ computeTables();
+ return;
+ }
+
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
.legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64})
.clampScalar(0, s1, s64)
@@ -74,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({s32, s64, v4s32, v2s32, v2s64})
- .clampScalar(0, s16, s64)
+ .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
@@ -104,6 +112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32, s64})
+ .libcallFor({s128})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
.scalarize(0);
@@ -115,8 +124,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
AmtTy.getSizeInBits() == 32;
})
- .legalFor(
- {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}})
+ .legalFor({{s32, s32},
+ {s32, s64},
+ {s64, s64},
+ {v2s32, v2s32},
+ {v4s32, v4s32},
+ {v2s64, v2s64}})
.clampScalar(1, s32, s64)
.clampScalar(0, s32, s64)
.minScalarSameAs(1, 0);
@@ -191,14 +204,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.legalIf([=](const LegalityQuery &Query) {
const LLT &Ty0 = Query.Types[0];
const LLT &Ty1 = Query.Types[1];
- if (Ty1 != s32 && Ty1 != s64)
+ if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
return false;
if (Ty1 == p0)
return true;
return isPowerOf2_32(Ty0.getSizeInBits()) &&
(Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
})
- .clampScalar(1, s32, s64)
+ .clampScalar(1, s32, s128)
.widenScalarToNextPow2(1)
.maxScalarIf(typeInSet(1, {s32}), 0, s16)
.maxScalarIf(typeInSet(1, {s64}), 0, s32)
@@ -236,6 +249,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{s32, p0, 32, 8},
{s64, p0, 64, 8},
{p0, p0, 64, 8},
+ {s128, p0, 128, 8},
{v8s8, p0, 64, 8},
{v16s8, p0, 128, 8},
{v4s16, p0, 64, 8},
@@ -247,14 +261,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.legalForTypesWithMemDesc({{s32, p0, 8, 8},
{s32, p0, 16, 8}})
.clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0)
- // TODO: We could support sum-of-pow2's but the lowering code doesn't know
- // how to do that yet.
- .unsupportedIfMemSizeNotPow2()
+ .lowerIfMemSizeNotPow2()
// Lower any any-extending loads left into G_ANYEXT and G_LOAD
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
})
+ .widenScalarToNextPow2(0)
.clampMaxNumElements(0, s32, 2)
.clampMaxNumElements(0, s64, 1)
.customIf(IsPtrVecPred);
@@ -262,9 +274,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_STORE)
.legalForTypesWithMemDesc({{s8, p0, 8, 8},
{s16, p0, 16, 8},
+ {s32, p0, 8, 8},
+ {s32, p0, 16, 8},
{s32, p0, 32, 8},
{s64, p0, 64, 8},
{p0, p0, 64, 8},
+ {s128, p0, 128, 8},
{v16s8, p0, 128, 8},
{v4s16, p0, 64, 8},
{v8s16, p0, 128, 8},
@@ -272,10 +287,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{v4s32, p0, 128, 8},
{v2s64, p0, 128, 8}})
.clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0)
- // TODO: We could support sum-of-pow2's but the lowering code doesn't know
- // how to do that yet.
- .unsupportedIfMemSizeNotPow2()
+ .lowerIfMemSizeNotPow2()
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
@@ -305,8 +317,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{v8s16, v8s16},
{v8s8, v8s8},
{v16s8, v16s8}})
- .clampScalar(0, s32, s32)
.clampScalar(1, s32, s64)
+ .clampScalar(0, s32, s32)
.minScalarEltSameAsIf(
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
@@ -330,33 +342,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.widenScalarToNextPow2(1);
// Extensions
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
- .legalIf([=](const LegalityQuery &Query) {
- unsigned DstSize = Query.Types[0].getSizeInBits();
-
- // Make sure that we have something that will fit in a register, and
- // make sure it's a power of 2.
- if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
- return false;
+ auto ExtLegalFunc = [=](const LegalityQuery &Query) {
+ unsigned DstSize = Query.Types[0].getSizeInBits();
+
+ if (DstSize == 128 && !Query.Types[0].isVector())
+ return false; // Extending to a scalar s128 needs narrowing.
+
+ // Make sure that we have something that will fit in a register, and
+ // make sure it's a power of 2.
+ if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
+ return false;
- const LLT &SrcTy = Query.Types[1];
+ const LLT &SrcTy = Query.Types[1];
- // Special case for s1.
- if (SrcTy == s1)
- return true;
+ // Special case for s1.
+ if (SrcTy == s1)
+ return true;
- // Make sure we fit in a register otherwise. Don't bother checking that
- // the source type is below 128 bits. We shouldn't be allowing anything
- // through which is wider than the destination in the first place.
- unsigned SrcSize = SrcTy.getSizeInBits();
- if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
- return false;
+ // Make sure we fit in a register otherwise. Don't bother checking that
+ // the source type is below 128 bits. We shouldn't be allowing anything
+ // through which is wider than the destination in the first place.
+ unsigned SrcSize = SrcTy.getSizeInBits();
+ if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
+ return false;
- return true;
- });
+ return true;
+ };
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalIf(ExtLegalFunc)
+ .clampScalar(0, s64, s64); // Just for s128, others are handled above.
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
// FP conversions
getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
{{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}});
@@ -591,6 +610,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0] == p0 && Query.Types[1] == s64;
});
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -617,6 +638,24 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ LegalizerHelper::UnableToLegalize)
+ return false;
+ MI.eraseFromParent();
+ return true;
+ default:
+ break;
+ }
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeShlAshrLshr(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const {
@@ -655,7 +694,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
// legalized. In order to allow further legalization of the inst, we create
// a new instruction and erase the existing one.
- unsigned ValReg = MI.getOperand(0).getReg();
+ Register ValReg = MI.getOperand(0).getReg();
const LLT ValTy = MRI.getType(ValReg);
if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
@@ -672,7 +711,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg});
MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO);
} else {
- unsigned NewReg = MRI.createGenericVirtualRegister(NewTy);
+ Register NewReg = MRI.createGenericVirtualRegister(NewTy);
auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO);
MIRBuilder.buildBitcast({ValReg}, {NewLoad});
}
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index f3362a18620f..15161bab466c 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -31,6 +31,9 @@ public:
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const override;
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
private:
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 65b5f906e3f6..a0c4a25bb5b9 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,8 +201,22 @@ static bool isNarrowStore(unsigned Opc) {
}
}
+// These instructions set the memory tag and either keep the memory contents
+// unchanged or set them to zero, ignoring the address part of the source
+// register.
+static bool isTagStore(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return true;
+ }
+}
+
// Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr &MI) {
+static int getMemScale(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode has unknown scale!");
@@ -255,6 +269,11 @@ static int getMemScale(MachineInstr &MI) {
case AArch64::STURQi:
case AArch64::LDPQi:
case AArch64::STPQi:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ case AArch64::STGPi:
return 16;
}
}
@@ -449,6 +468,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::STPWpre;
case AArch64::STPXi:
return AArch64::STPXpre;
+ case AArch64::STGOffset:
+ return AArch64::STGPreIndex;
+ case AArch64::STZGOffset:
+ return AArch64::STZGPreIndex;
+ case AArch64::ST2GOffset:
+ return AArch64::ST2GPreIndex;
+ case AArch64::STZ2GOffset:
+ return AArch64::STZ2GPreIndex;
+ case AArch64::STGPi:
+ return AArch64::STGPpre;
}
}
@@ -518,6 +547,16 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::STPWpost;
case AArch64::STPXi:
return AArch64::STPXpost;
+ case AArch64::STGOffset:
+ return AArch64::STGPostIndex;
+ case AArch64::STZGOffset:
+ return AArch64::STZGPostIndex;
+ case AArch64::ST2GOffset:
+ return AArch64::ST2GPostIndex;
+ case AArch64::STZ2GOffset:
+ return AArch64::STZ2GPostIndex;
+ case AArch64::STGPi:
+ return AArch64::STGPpost;
}
}
@@ -536,10 +575,30 @@ static bool isPairedLdSt(const MachineInstr &MI) {
case AArch64::STPQi:
case AArch64::STPWi:
case AArch64::STPXi:
+ case AArch64::STGPi:
return true;
}
}
+// Returns the scale and offset range of pre/post indexed variants of MI.
+static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
+ int &MinOffset, int &MaxOffset) {
+ bool IsPaired = isPairedLdSt(MI);
+ bool IsTagStore = isTagStore(MI);
+ // ST*G and all paired ldst have the same scale in pre/post-indexed variants
+ // as in the "unsigned offset" variant.
+ // All other pre/post indexed ldst instructions are unscaled.
+ Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+
+ if (IsPaired) {
+ MinOffset = -64;
+ MaxOffset = 63;
+ } else {
+ MinOffset = -256;
+ MaxOffset = 255;
+ }
+}
+
static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
@@ -618,6 +677,11 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
case AArch64::LDRWui:
case AArch64::LDRHHui:
case AArch64::LDRBBui:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ case AArch64::STGPi:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
@@ -808,7 +872,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// STRWui %w1, ...
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
// STRW %w0
- unsigned Reg = getLdStRegOp(*I).getReg();
+ Register Reg = getLdStRegOp(*I).getReg();
for (MachineInstr &MI : make_range(std::next(I), Paired))
MI.clearRegisterKills(Reg, TRI);
}
@@ -837,9 +901,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineOperand &DstMO = MIB->getOperand(SExtIdx);
// Right now, DstMO has the extended register, since it comes from an
// extended opcode.
- unsigned DstRegX = DstMO.getReg();
+ Register DstRegX = DstMO.getReg();
// Get the W variant of that register.
- unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
+ Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
// Update the result of LDP to use the W instead of the X variant.
DstMO.setReg(DstRegW);
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
@@ -882,9 +946,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
- unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ Register LdRt = getLdStRegOp(*LoadI).getReg();
const MachineOperand &StMO = getLdStRegOp(*StoreI);
- unsigned StRt = getLdStRegOp(*StoreI).getReg();
+ Register StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
assert((IsStoreXReg ||
@@ -933,10 +997,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
? getLdStOffsetOp(*StoreI).getImm()
: getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
- unsigned DestReg = IsStoreXReg
- ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
- &AArch64::GPR64RegClass)
- : LdRt;
+ unsigned DestReg =
+ IsStoreXReg ? Register(TRI->getMatchingSuperReg(
+ LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
+ : LdRt;
assert((UnscaledLdOffset >= UnscaledStOffset &&
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
@@ -1042,7 +1106,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
MachineInstr &LoadMI = *I;
- unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+ Register BaseReg = getLdStBaseOp(LoadMI).getReg();
// If the load is the first instruction in the block, there's obviously
// not any matching store.
@@ -1156,8 +1220,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
- unsigned Reg = getLdStRegOp(FirstMI).getReg();
- unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ Register Reg = getLdStRegOp(FirstMI).getReg();
+ Register BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
@@ -1188,7 +1252,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
- unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+ Register MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
if (IsUnscaled != MIIsUnscaled) {
@@ -1328,18 +1392,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
: getPostIndexedOpcode(I->getOpcode());
MachineInstrBuilder MIB;
+ int Scale, MinOffset, MaxOffset;
+ getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I))
.add(getLdStBaseOp(*I))
- .addImm(Value)
+ .addImm(Value / Scale)
.setMemRefs(I->memoperands())
.setMIFlags(I->mergeFlagsWith(*Update));
} else {
// Paired instruction.
- int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I, 0))
@@ -1395,28 +1460,21 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MI.getOperand(1).getReg() != BaseReg)
break;
- bool IsPairedInsn = isPairedLdSt(MemMI);
int UpdateOffset = MI.getOperand(2).getImm();
if (MI.getOpcode() == AArch64::SUBXri)
UpdateOffset = -UpdateOffset;
- // For non-paired load/store instructions, the immediate must fit in a
- // signed 9-bit integer.
- if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+ // The immediate must be a multiple of the scaling factor of the pre/post
+ // indexed instruction.
+ int Scale, MinOffset, MaxOffset;
+ getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
+ if (UpdateOffset % Scale != 0)
break;
- // For paired load/store instructions, the immediate must be a multiple of
- // the scaling factor. The scaled offset must also fit into a signed 7-bit
- // integer.
- if (IsPairedInsn) {
- int Scale = getMemScale(MemMI);
- if (UpdateOffset % Scale != 0)
- break;
-
- int ScaledOffset = UpdateOffset / Scale;
- if (ScaledOffset > 63 || ScaledOffset < -64)
- break;
- }
+ // Scaled offset must fit in the instruction immediate.
+ int ScaledOffset = UpdateOffset / Scale;
+ if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
+ break;
// If we have a non-zero Offset, we check that it matches the amount
// we're adding to the register.
@@ -1433,7 +1491,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
- unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ Register BaseReg = getLdStBaseOp(MemMI).getReg();
int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
// Scan forward looking for post-index opportunities. Updating instructions
@@ -1442,13 +1500,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
if (MIUnscaledOffset != UnscaledOffset)
return E;
- // If the base register overlaps a destination register, we can't
- // merge the update.
- bool IsPairedInsn = isPairedLdSt(MemMI);
- for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
- unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ // If the base register overlaps a source/destination register, we can't
+ // merge the update. This does not apply to tag store instructions, which
+ // ignore the address part of the source register.
+ // Nor does it apply to STGPi which, unlike normal stores, has no
+ // unpredictable behavior in this case and always performs the writeback
+ // after reading the source register value.
+ if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ Register DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
}
// Track which register units have been modified and used between the first
@@ -1487,7 +1551,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
- unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ Register BaseReg = getLdStBaseOp(MemMI).getReg();
int Offset = getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
@@ -1496,11 +1560,13 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
// If the base register overlaps a destination register, we can't
// merge the update.
- bool IsPairedInsn = isPairedLdSt(MemMI);
- for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
- unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ if (!isTagStore(MemMI)) {
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ Register DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
}
// Track which register units have been modified and used between the first
@@ -1659,7 +1725,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
// however, is not, so adjust here.
int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
- // Look forward to try to find a post-index instruction. For example,
+ // Look forward to try to find a pre-index instruction. For example,
// ldr x1, [x0, #64]
// add x0, x0, #64
// merged into:
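
To make the writeback-folding rules above concrete, here is a minimal standalone sketch (the helper name is hypothetical, not part of the patch) of the immediate check that getPrePostIndexedMemOpInfo and isMatchingUpdateInsn implement together for tag stores and paired instructions:

    // Sketch: can an "add/sub base, base, #UpdateOffset" be folded into the
    // pre/post-indexed form of a memory instruction with the given scale and
    // scaled-immediate range?
    static bool canFoldUpdateImmediate(int UpdateOffset, int Scale,
                                       int MinOffset, int MaxOffset) {
      // The byte offset must be a multiple of the instruction's scale.
      if (UpdateOffset % Scale != 0)
        return false;
      // The scaled offset must fit in the writeback immediate field.
      int ScaledOffset = UpdateOffset / Scale;
      return ScaledOffset >= MinOffset && ScaledOffset <= MaxOffset;
    }

For STGOffset the scale is 16 and the unpaired range is [-256, 255], so an adjacent "add x0, x0, #32" passes (32 / 16 = 2) and can be merged into a pre- or post-indexed STG, while an update of #24 fails the scale check and is left alone.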
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index e7d4a2789a28..afd5ae6bcbf2 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -148,6 +148,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
RefFlags |= AArch64MCExpr::VK_TLSDESC;
break;
}
+ } else if (MO.getTargetFlags() & AArch64II::MO_PREL) {
+ RefFlags |= AArch64MCExpr::VK_PREL;
} else {
// No modifier means this is a generic reference, classified as absolute for
// the cases where it matters (:abs_g0: etc).
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0efeeb272ec1..0009fb7b5520 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include <cassert>
@@ -95,6 +96,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// returned struct in a register. This field holds the virtual register into
/// which the sret argument is passed.
unsigned SRetReturnReg = 0;
+ /// The SVE stack size (for predicates and data vectors) is maintained here
+ /// rather than in FrameInfo, as the placement and Stack IDs are
+ /// target-specific.
+ uint64_t StackSizeSVE = 0;
+
+ /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid.
+ bool HasCalculatedStackSizeSVE = false;
/// Has a value when it is known whether or not the function uses a
/// redzone, and no value otherwise.
@@ -131,6 +139,15 @@ public:
ArgumentStackToRestore = bytes;
}
+ bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+
+ void setStackSizeSVE(uint64_t S) {
+ HasCalculatedStackSizeSVE = true;
+ StackSizeSVE = S;
+ }
+
+ uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
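
As a rough illustration of how the new SVE stack-size accessors are meant to be used (the free functions below are assumptions for illustration, not part of the patch), frame lowering records the size once and other code reads it only when the flag says it has been computed:

    #include "AArch64MachineFunctionInfo.h"
    using namespace llvm;

    // Record the SVE stack area once it has been laid out; the setter also
    // flips HasCalculatedStackSizeSVE so later queries know the value is valid.
    static void recordSVEArea(AArch64FunctionInfo &AFI, uint64_t SVEBytes) {
      AFI.setStackSizeSVE(SVEBytes);
    }

    // Read it back defensively: callers that can run before frame layout
    // should check the flag first.
    static uint64_t sveAreaOrZero(const AArch64FunctionInfo &AFI) {
      return AFI.hasCalculatedStackSizeSVE() ? AFI.getStackSizeSVE() : 0;
    }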
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index aff861aae6be..d503c39b1f90 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -162,11 +162,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
LiveIntervals &LIs = G.getMetadata().LIS;
- if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
- LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
- << '\n');
- LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
- << '\n');
+ if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) {
+ LLVM_DEBUG(dbgs() << "Rd is a physical reg:"
+ << Register::isPhysicalRegister(Rd) << '\n');
+ LLVM_DEBUG(dbgs() << "Ra is a physical reg:"
+ << Register::isPhysicalRegister(Ra) << '\n');
return false;
}
@@ -359,8 +359,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
case AArch64::FMADDDrrr:
case AArch64::FNMSUBDrrr:
case AArch64::FNMADDDrrr: {
- unsigned Rd = MI.getOperand(0).getReg();
- unsigned Ra = MI.getOperand(3).getReg();
+ Register Rd = MI.getOperand(0).getReg();
+ Register Ra = MI.getOperand(3).getReg();
if (addIntraChainConstraint(G, Rd, Ra))
addInterChainConstraint(G, Rd, Ra);
@@ -369,7 +369,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
case AArch64::FMLAv2f32:
case AArch64::FMLSv2f32: {
- unsigned Rd = MI.getOperand(0).getReg();
+ Register Rd = MI.getOperand(0).getReg();
addInterChainConstraint(G, Rd, Rd);
break;
}
diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index 5f7245bfbd74..d30ea120bae4 100644
--- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -15,7 +15,9 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
@@ -25,12 +27,31 @@
using namespace llvm;
using namespace MIPatternMatch;
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
namespace {
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
public:
- AArch64PreLegalizerCombinerInfo()
+ AArch64GenPreLegalizerCombinerHelper Generated;
+
+ AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
- /*LegalizerInfo*/ nullptr) {}
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!Generated.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
MachineIRBuilder &B) const override;
};
@@ -38,24 +59,50 @@ public:
bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B);
+ CombinerHelper Helper(Observer, B, KB, MDT);
switch (MI.getOpcode()) {
- default:
- return false;
- case TargetOpcode::COPY:
- return Helper.tryCombineCopy(MI);
- case TargetOpcode::G_BR:
- return Helper.tryCombineBr(MI);
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
- return Helper.tryCombineExtendingLoads(MI);
+ case TargetOpcode::G_ZEXTLOAD: {
+ bool Changed = false;
+ Changed |= Helper.tryCombineExtendingLoads(MI);
+ Changed |= Helper.tryCombineIndexedLoadStore(MI);
+ return Changed;
+ }
+ case TargetOpcode::G_STORE:
+ return Helper.tryCombineIndexedLoadStore(MI);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset: {
+ // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
+ // heuristics decide.
+ unsigned MaxLen = EnableOpt ? 0 : 32;
+ // Try to inline memcpy type calls if optimizations are enabled.
+ return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen)
+ : false;
+ }
+ default:
+ break;
+ }
}
+ if (Generated.tryCombineAll(Observer, MI, B))
+ return true;
+
return false;
}
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
// Pass boilerplate
// ================
@@ -63,24 +110,33 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
static char ID;
- AArch64PreLegalizerCombiner();
+ AArch64PreLegalizerCombiner(bool IsOptNone = false);
StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; }
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
};
-}
+} // end anonymous namespace
void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
AU.setPreservesCFG();
getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
MachineFunctionPass::getAnalysisUsage(AU);
}
-AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) {
+AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
@@ -89,7 +145,14 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionProperties::Property::FailedISel))
return false;
auto *TPC = &getAnalysis<TargetPassConfig>();
- AArch64PreLegalizerCombinerInfo PCInfo;
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
Combiner C(PCInfo, TPC);
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}
@@ -99,13 +162,14 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization", false,
false)
namespace llvm {
-FunctionPass *createAArch64PreLegalizeCombiner() {
- return new AArch64PreLegalizerCombiner();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) {
+ return new AArch64PreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
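
The -O0 special-casing of the memcpy-family intrinsics above reduces to one small decision; the helper below is a hypothetical restatement of it (names are assumptions, semantics taken from this hunk):

    // Decide whether to attempt tryCombineMemCpyFamily and, if so, which
    // length cap to pass. A cap of 0 means "no explicit limit, defer to the
    // generic heuristics"; 32 is the -O0 inlining limit.
    static bool shouldCombineMemOp(bool EnableOpt, bool EnableMinSize,
                                   unsigned &MaxLen) {
      if (EnableMinSize)
        return false;              // never inline when optimizing for min size
      MaxLen = EnableOpt ? 0 : 32; // only cap the length at -O0
      return true;
    }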
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index b52259cc9acd..8ec73aa3c040 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -563,12 +563,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getSameKindOfOperandsMapping(MI);
}
case TargetOpcode::COPY: {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
// Check if one of the register is not a generic register.
- if ((TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ if ((Register::isPhysicalRegister(DstReg) ||
!MRI.getType(DstReg).isValid()) ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+ (Register::isPhysicalRegister(SrcReg) ||
!MRI.getType(SrcReg).isValid())) {
const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI);
const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI);
@@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Some of the floating-point instructions have mixed GPR and FPR operands:
// fine-tune the computed mapping.
switch (Opc) {
+ case TargetOpcode::G_TRUNC: {
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ break;
+ }
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
@@ -687,7 +693,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_STORE:
// Check if that store is fed by fp instructions.
if (OpRegBankIdx[0] == PMI_FirstGPR) {
- unsigned VReg = MI.getOperand(0).getReg();
+ Register VReg = MI.getOperand(0).getReg();
if (!VReg)
break;
MachineInstr *DefMI = MRI.getVRegDef(VReg);
@@ -702,11 +708,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
// If we're taking in vectors, we have no choice but to put everything on
- // FPRs.
+ // FPRs, except for the condition. The condition must always be on a GPR.
LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
if (SrcTy.isVector()) {
- for (unsigned Idx = 0; Idx < 4; ++Idx)
- OpRegBankIdx[Idx] = PMI_FirstFPR;
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR};
break;
}
@@ -740,7 +745,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This doesn't check the condition, since it's just whatever is in NZCV.
// This isn't passed explicitly in a register to fcsel/csel.
for (unsigned Idx = 2; Idx < 4; ++Idx) {
- unsigned VReg = MI.getOperand(Idx).getReg();
+ Register VReg = MI.getOperand(Idx).getReg();
MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank ||
onlyDefinesFP(*DefMI, MRI, TRI))
@@ -750,8 +755,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// If we have more FP constraints than not, then move everything over to
// FPR.
if (NumFP >= 2)
- for (unsigned Idx = 0; Idx < 4; ++Idx)
- OpRegBankIdx[Idx] = PMI_FirstFPR;
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR};
break;
}
@@ -764,7 +768,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg());
// UNMERGE into scalars from a vector should always use FPR.
// Likewise if any of the uses are FP instructions.
- if (SrcTy.isVector() ||
+ if (SrcTy.isVector() || SrcTy == LLT::scalar(128) ||
any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
[&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) {
// Set the register bank of every operand to FPR.
@@ -795,12 +799,21 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Index needs to be a GPR.
OpRegBankIdx[3] = PMI_FirstGPR;
break;
+ case TargetOpcode::G_EXTRACT: {
+ // For s128 sources we have to use fpr.
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (SrcTy.getSizeInBits() == 128) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ OpRegBankIdx[1] = PMI_FirstFPR;
+ }
+ break;
+ }
case TargetOpcode::G_BUILD_VECTOR:
// If the first source operand belongs to a FPR register bank, then make
// sure that we preserve that.
if (OpRegBankIdx[1] != PMI_FirstGPR)
break;
- unsigned VReg = MI.getOperand(1).getReg();
+ Register VReg = MI.getOperand(1).getReg();
if (!VReg)
break;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 6d5a4e3d2f76..de176088595d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -23,10 +24,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -63,8 +64,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
- else
- return CSR_AArch64_AAPCS_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return CSR_Darwin_AArch64_AAPCS_SaveList;
+ return CSR_AArch64_AAPCS_SaveList;
}
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
@@ -120,6 +122,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: CSR_AArch64_CXX_TLS_Darwin_RegMask;
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
+ if (CC == CallingConv::AArch64_SVE_VectorCall)
+ return CSR_AArch64_SVE_AAPCS_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
@@ -388,7 +392,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
int64_t Offset) const {
assert(Offset <= INT_MAX && "Offset too big to fit in int.");
assert(MI && "Unable to get the legal offset for nil instruction.");
- int SaveOffset = Offset;
+ StackOffset SaveOffset(Offset, MVT::i8);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
}
@@ -418,7 +422,9 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
- int Off = Offset; // ARM doesn't need the general 64-bit offsets
+ // AArch64 doesn't need the general 64-bit offsets
+ StackOffset Off(Offset, MVT::i8);
+
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
@@ -441,40 +447,69 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ bool Tagged =
+ MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
unsigned FrameReg;
- int Offset;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
- Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
- /*PreferFP=*/true,
- /*ForSimm=*/false);
- Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ StackOffset Offset =
+ TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+ /*PreferFP=*/true,
+ /*ForSimm=*/false);
+ Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
return;
}
if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
- Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
+ int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
FI.ChangeToImmediate(Offset);
return;
}
+ StackOffset Offset;
if (MI.getOpcode() == AArch64::TAGPstack) {
// TAGPstack must use the virtual frame register in its 3rd operand.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
FrameReg = MI.getOperand(3).getReg();
- Offset =
- MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset();
+ Offset = {MFI.getObjectOffset(FrameIndex) +
+ AFI->getTaggedBasePointerOffset(),
+ MVT::i8};
+ } else if (Tagged) {
+ StackOffset SPOffset = {
+ MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
+ if (MFI.hasVarSizedObjects() ||
+ isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
+ (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
+ // Can't update to SP + offset in place. Precalculate the tagged pointer
+ // in a scratch register.
+ Offset = TFI->resolveFrameIndexReference(
+ MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
+ TII);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(0);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(ScratchReg, false, false, true);
+ return;
+ }
+ FrameReg = AArch64::SP;
+ Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
+ MVT::i8};
} else {
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
@@ -490,7 +525,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- unsigned ScratchReg =
+ Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 854670079e40..28a7e680849b 100644
--- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -426,16 +426,16 @@ bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Get the operands of the current SIMD arithmetic instruction.
- unsigned MulDest = MI.getOperand(0).getReg();
- unsigned SrcReg0 = MI.getOperand(1).getReg();
+ Register MulDest = MI.getOperand(0).getReg();
+ Register SrcReg0 = MI.getOperand(1).getReg();
unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
- unsigned SrcReg1 = MI.getOperand(2).getReg();
+ Register SrcReg1 = MI.getOperand(2).getReg();
unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
unsigned DupDest;
// Instructions of interest have either 4 or 5 operands.
if (MI.getNumOperands() == 5) {
- unsigned SrcReg2 = MI.getOperand(3).getReg();
+ Register SrcReg2 = MI.getOperand(3).getReg();
unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
unsigned LaneNumber = MI.getOperand(4).getImm();
// Create a new DUP instruction. Note that if an equivalent DUP instruction
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 79ab42f4c080..b573eac76754 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -82,11 +82,11 @@ let Predicates = [HasSVE] in {
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
- defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
- defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
+ defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
+ defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
- defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
- defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
+ defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
+ defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
@@ -94,14 +94,14 @@ let Predicates = [HasSVE] in {
defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
- defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
- defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;
-
- defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
- defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
- defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
- defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
- defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>;
+ defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
@@ -138,12 +138,12 @@ let Predicates = [HasSVE] in {
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
- defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
- defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
- defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
- defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
- defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
- defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>;
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>;
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
@@ -187,7 +187,7 @@ let Predicates = [HasSVE] in {
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
- defm DUP_ZR : sve_int_perm_dup_r<"dup">;
+ defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
@@ -211,13 +211,13 @@ let Predicates = [HasSVE] in {
defm REV_PP : sve_int_perm_reverse_p<"rev">;
defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
- defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
- defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
- defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
- defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;
+ defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
+ defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
+ defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
+ defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>;
- def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
- def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;
+ defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
+ defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
@@ -1020,6 +1020,56 @@ let Predicates = [HasSVE] in {
(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+ def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+ def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+ def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+ def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+
+ def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+
+ def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
}
let Predicates = [HasSVE2] in {
@@ -1164,6 +1214,13 @@ let Predicates = [HasSVE2] in {
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+ // SVE2 predicated shifts
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
@@ -1199,14 +1256,14 @@ let Predicates = [HasSVE2] in {
defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
// SVE2 bitwise shift and insert
- defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
- defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
- defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
- defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
- defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
@@ -1228,41 +1285,47 @@ let Predicates = [HasSVE2] in {
defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
- // SVE2 bitwise shift right narrow
- defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
- defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
- defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
- defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
- defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
- defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
- defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
- defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
- defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
- defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
- defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
- defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
- defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
- defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
- defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
- defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
-
- // SVE2 integer add/subtract narrow high part
- defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">;
- defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">;
- defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
- defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
- defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">;
- defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">;
- defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
- defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
-
- // SVE2 saturating extract narrow
- defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
- defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
- defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
- defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
- defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
- defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
+ // SVE2 bitwise shift right narrow (bottom)
+ defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
+ defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
+ defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
+ defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
+ defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
+ defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
+ defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
+ defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;
+
+ // SVE2 bitwise shift right narrow (top)
+ defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
+ defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
+ defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
+ defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
+ defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
+ defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
+ defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
+ defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;
+
+ // SVE2 integer add/subtract narrow high part (bottom)
+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;
+
+ // SVE2 integer add/subtract narrow high part (top)
+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
+
+ // SVE2 saturating extract narrow (bottom)
+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
+
+ // SVE2 saturating extract narrow (top)
+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
@@ -1289,10 +1352,14 @@ let Predicates = [HasSVE2] in {
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+ // SVE2 floating-point base 2 logarithm as integer
+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+
// SVE2 floating-point convert precision
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
+ def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
// SVE2 floating-point pairwise operations
defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
@@ -1321,58 +1388,45 @@ let Predicates = [HasSVE2] in {
def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
- // sve_int_rotate_imm
+ // SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
- // SVE floating-point convert precision
- def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
-
- // SVE floating-point convert to integer
- defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
-
- // Non-temporal contiguous loads (vector + register)
- defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ // SVE2 non-temporal gather loads
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
- // Predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
-
- // Non-temporal contiguous stores (vector + register)
- defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+ // SVE2 non-temporal scatter stores
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
- defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
- // SVE table lookup (three sources)
+ // SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
- // SVE integer compare scalar count and limit
+ // SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
@@ -1383,7 +1437,7 @@ let Predicates = [HasSVE2] in {
defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;
- // SVE pointer conflict compare
+ // SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 60dbace03ca6..ba61ed726e84 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,7 +32,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
const AArch64TargetLowering &TLI = *STI.getTargetLowering();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
- Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
index 3087e6ce441d..7307961ddb5f 100644
--- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -106,6 +106,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -115,9 +116,9 @@ using namespace llvm;
#define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass"
-cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
- cl::desc("Sanitize loads from memory."),
- cl::init(true));
+static cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
+ cl::desc("Sanitize loads from memory."),
+ cl::init(true));
namespace {
@@ -521,7 +522,7 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) {
for (auto Use : MI.uses()) {
if (!Use.isReg())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
// Some loads of floating point data have implicit defs/uses on a
// super register of that floating point data. Some examples:
// $s0 = LDRSui $sp, 22, implicit-def $q0
@@ -561,8 +562,8 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue(
// miss-speculation isn't happening because we're already inserting barriers
// to guarantee that.
if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
// Mark this register and all its aliasing registers as needing to be
// value speculation hardened before its next use, by using a CSDB
// barrier instruction.
diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h
new file mode 100644
index 000000000000..13f12a6c9c30
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackOffset.h
@@ -0,0 +1,138 @@
+//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the StackOffset class, which is used to
+// describe scalable and non-scalable offsets during frame lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+/// StackOffset is a wrapper around scalable and non-scalable offsets and is
+/// used in several functions such as 'isAArch64FrameOffsetLegal' and
+/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g.
+///
+///   StackOffset(1, MVT::nxv16i8)
+///
+/// would describe an offset as being the size of a single SVE vector.
+///
+/// The class also implements simple arithmetic (addition/subtraction) on these
+/// offsets, e.g.
+///
+///   StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64)
+///
+/// describes an offset that spans the combined storage required for an SVE
+/// vector and a 64-bit GPR.
+class StackOffset {
+ int64_t Bytes;
+ int64_t ScalableBytes;
+
+ explicit operator int() const;
+
+public:
+ using Part = std::pair<int64_t, MVT>;
+
+ StackOffset() : Bytes(0), ScalableBytes(0) {}
+
+ StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
+ assert(MVT(T).getSizeInBits() % 8 == 0 &&
+ "Offset type is not a multiple of bytes");
+ *this += Part(Offset, T);
+ }
+
+ StackOffset(const StackOffset &Other)
+ : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}
+
+ StackOffset &operator=(const StackOffset &) = default;
+
+ StackOffset &operator+=(const StackOffset::Part &Other) {
+ int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+ if (Other.second.isScalableVector())
+ ScalableBytes += OffsetInBytes;
+ else
+ Bytes += OffsetInBytes;
+ return *this;
+ }
+
+ StackOffset &operator+=(const StackOffset &Other) {
+ Bytes += Other.Bytes;
+ ScalableBytes += Other.ScalableBytes;
+ return *this;
+ }
+
+ StackOffset operator+(const StackOffset &Other) const {
+ StackOffset Res(*this);
+ Res += Other;
+ return Res;
+ }
+
+ StackOffset &operator-=(const StackOffset &Other) {
+ Bytes -= Other.Bytes;
+ ScalableBytes -= Other.ScalableBytes;
+ return *this;
+ }
+
+ StackOffset operator-(const StackOffset &Other) const {
+ StackOffset Res(*this);
+ Res -= Other;
+ return Res;
+ }
+
+ StackOffset operator-() const {
+ StackOffset Res = {};
+ const StackOffset Other(*this);
+ Res -= Other;
+ return Res;
+ }
+
+ /// Returns the scalable part of the offset in bytes.
+ int64_t getScalableBytes() const { return ScalableBytes; }
+
+ /// Returns the non-scalable part of the offset in bytes.
+ int64_t getBytes() const { return Bytes; }
+
+ /// Decomposes the offset into the parts needed to describe it as a frame
+ /// offset: a plain byte offset, a number of SVE predicate vectors (ADDPL
+ /// units) and a number of SVE data vectors (ADDVL units). For a purely
+ /// non-scalable offset this is simply its byte size.
+ void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
+ int64_t &NumDataVectors) const {
+ assert(isValid() && "Invalid frame offset");
+
+ NumBytes = Bytes;
+ NumDataVectors = 0;
+ NumPredicateVectors = ScalableBytes / 2;
+ // This method is used to get the offsets to adjust the frame offset.
+ // If the function requires ADDPL to be used and needs more than two ADDPL
+ // instructions, part of the offset is folded into NumDataVectors so that it
+ // uses ADDVL for part of it, reducing the number of ADDPL instructions.
+ if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
+ NumPredicateVectors > 62) {
+ NumDataVectors = NumPredicateVectors / 8;
+ NumPredicateVectors -= NumDataVectors * 8;
+ }
+ }
+
+ /// Returns true if the offset is non-zero in either its scalable or its
+ /// non-scalable part.
+ explicit operator bool() const { return Bytes || ScalableBytes; }
+
+ bool isValid() const {
+ // The smallest scalable element supported by scaled SVE addressing modes
+ // is the predicate vector, which is 2 scalable bytes in size. So the
+ // scalable byte offset must always be a multiple of 2.
+ return ScalableBytes % 2 == 0;
+ }
+};
+
+} // end namespace llvm
+
+#endif
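As a quick, self-contained illustration of the decomposition above (this sketch is not part of the patch; it models getForFrameOffset() without the MVT machinery, assuming the 2-scalable-byte predicate granule and 16-scalable-byte data vector described in the header): an offset of 16 fixed bytes plus 130 scalable bytes (65 predicate granules) folds into 8 ADDVL steps and 1 ADDPL step.

#include <cassert>
#include <cstdint>
#include <iostream>

// Mirrors StackOffset::getForFrameOffset(): predicates are 2 scalable bytes,
// SVE data vectors are 16 scalable bytes (8 predicate granules).
static void decompose(int64_t Bytes, int64_t ScalableBytes, int64_t &NumBytes,
                      int64_t &NumPredicateVectors, int64_t &NumDataVectors) {
  assert(ScalableBytes % 2 == 0 && "scalable part must be whole predicates");
  NumBytes = Bytes;
  NumDataVectors = 0;
  NumPredicateVectors = ScalableBytes / 2;
  // Fold whole data vectors out of the predicate count when it is a multiple
  // of 8 or falls outside the range an ADDPL immediate can express.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

int main() {
  // 16 fixed bytes plus 130 scalable bytes (65 predicate granules).
  int64_t NumBytes, NumPL, NumVL;
  decompose(16, 130, NumBytes, NumPL, NumVL);
  std::cout << "bytes=" << NumBytes << " ADDVL=" << NumVL
            << " ADDPL=" << NumPL << "\n"; // bytes=16 ADDVL=8 ADDPL=1
}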
diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp
index 6e99c48bf1d7..e6dbe01d3807 100644
--- a/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -55,9 +56,215 @@ using namespace llvm;
#define DEBUG_TYPE "stack-tagging"
-static constexpr unsigned kTagGranuleSize = 16;
+static cl::opt<bool> ClMergeInit(
+ "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+ cl::desc("merge stack variable initializers with tagging when possible"));
+
+static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
+ cl::init(40), cl::Hidden);
+
+static const Align kTagGranuleSize = Align(16);
namespace {
+
+class InitializerBuilder {
+ uint64_t Size;
+ const DataLayout *DL;
+ Value *BasePtr;
+ Function *SetTagFn;
+ Function *SetTagZeroFn;
+ Function *StgpFn;
+
+ // List of initializers sorted by start offset.
+ struct Range {
+ uint64_t Start, End;
+ Instruction *Inst;
+ };
+ SmallVector<Range, 4> Ranges;
+ // 8-aligned offset => 8-byte initializer
+ // Missing keys are zero initialized.
+ std::map<uint64_t, Value *> Out;
+
+public:
+ InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr,
+ Function *SetTagFn, Function *SetTagZeroFn,
+ Function *StgpFn)
+ : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn),
+ SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {}
+
+ bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) {
+ auto I = std::lower_bound(
+ Ranges.begin(), Ranges.end(), Start,
+ [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; });
+ if (I != Ranges.end() && End > I->Start) {
+ // Overlap - bail.
+ return false;
+ }
+ Ranges.insert(I, {Start, End, Inst});
+ return true;
+ }
+
+ bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) {
+ int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType());
+ if (!addRange(Offset, Offset + StoreSize, SI))
+ return false;
+ IRBuilder<> IRB(SI);
+ applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0));
+ return true;
+ }
+
+ bool addMemSet(uint64_t Offset, MemSetInst *MSI) {
+ uint64_t StoreSize = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if (!addRange(Offset, Offset + StoreSize, MSI))
+ return false;
+ IRBuilder<> IRB(MSI);
+ applyMemSet(IRB, Offset, Offset + StoreSize,
+ cast<ConstantInt>(MSI->getValue()));
+ return true;
+ }
+
+ void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End,
+ ConstantInt *V) {
+ // Out[] does not distinguish between zero and undef, and we already know
+ // that this memset does not overlap with any other initializer. Nothing to
+ // do for memset(0).
+ if (V->isZero())
+ return;
+ for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+ uint64_t Cst = 0x0101010101010101UL;
+ int LowBits = Offset < Start ? (Start - Offset) * 8 : 0;
+ if (LowBits)
+ Cst = (Cst >> LowBits) << LowBits;
+ int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0;
+ if (HighBits)
+ Cst = (Cst << HighBits) >> HighBits;
+ ConstantInt *C =
+ ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue());
+
+ Value *&CurrentV = Out[Offset];
+ if (!CurrentV) {
+ CurrentV = C;
+ } else {
+ CurrentV = IRB.CreateOr(CurrentV, C);
+ }
+ }
+ }
+
+ // Take a 64-bit slice of the value starting at the given offset (in bytes).
+ // Offset can be negative. Pad with zeroes on both sides when necessary.
+ Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) {
+ if (Offset > 0) {
+ V = IRB.CreateLShr(V, Offset * 8);
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ } else if (Offset < 0) {
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ V = IRB.CreateShl(V, -Offset * 8);
+ } else {
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ }
+ return V;
+ }
+
+ void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End,
+ Value *StoredValue) {
+ StoredValue = flatten(IRB, StoredValue);
+ for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+ Value *V = sliceValue(IRB, StoredValue, Offset - Start);
+ Value *&CurrentV = Out[Offset];
+ if (!CurrentV) {
+ CurrentV = V;
+ } else {
+ CurrentV = IRB.CreateOr(CurrentV, V);
+ }
+ }
+ }
+
+ void generate(IRBuilder<> &IRB) {
+ LLVM_DEBUG(dbgs() << "Combined initializer\n");
+ // No initializers => the entire allocation is undef.
+ if (Ranges.empty()) {
+ emitUndef(IRB, 0, Size);
+ return;
+ }
+
+ // Walk the 8-byte initializer list 16 bytes at a time.
+ // If either of the two 8-byte halves is non-zero and non-undef, emit STGP.
+ // Otherwise, emit zeroes up to the next available item.
+ uint64_t LastOffset = 0;
+ for (uint64_t Offset = 0; Offset < Size; Offset += 16) {
+ auto I1 = Out.find(Offset);
+ auto I2 = Out.find(Offset + 8);
+ if (I1 == Out.end() && I2 == Out.end())
+ continue;
+
+ if (Offset > LastOffset)
+ emitZeroes(IRB, LastOffset, Offset - LastOffset);
+
+ Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+ : I1->second;
+ Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+ : I2->second;
+ emitPair(IRB, Offset, Store1, Store2);
+ LastOffset = Offset + 16;
+ }
+
+ // memset(0) does not update Out[], therefore the tail can be either undef
+ // or zero.
+ if (LastOffset < Size)
+ emitZeroes(IRB, LastOffset, Size - LastOffset);
+
+ for (const auto &R : Ranges) {
+ R.Inst->eraseFromParent();
+ }
+ }
+
+ void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size
+ << ") zero\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(SetTagZeroFn,
+ {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ }
+
+ void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size
+ << ") undef\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ }
+
+ void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + 16 << "):\n");
+ LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(StgpFn, {Ptr, A, B});
+ }
+
+ Value *flatten(IRBuilder<> &IRB, Value *V) {
+ if (V->getType()->isIntegerTy())
+ return V;
+ // vector of pointers -> vector of ints
+ if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) {
+ LLVMContext &Ctx = IRB.getContext();
+ Type *EltTy = VecTy->getElementType();
+ if (EltTy->isPointerTy()) {
+ uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
+ Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
+ VecTy->getNumElements());
+ V = IRB.CreatePointerCast(V, NewTy);
+ }
+ }
+ return IRB.CreateBitOrPointerCast(
+ V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8));
+ }
+};
+
class AArch64StackTagging : public FunctionPass {
struct AllocaInfo {
AllocaInst *AI;
@@ -67,10 +274,15 @@ class AArch64StackTagging : public FunctionPass {
int Tag; // -1 for non-tagged allocations
};
+ bool MergeInit;
+
public:
static char ID; // Pass ID, replacement for typeid
- AArch64StackTagging() : FunctionPass(ID) {
+ AArch64StackTagging(bool MergeInit = true)
+ : FunctionPass(ID),
+ MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit
+ : MergeInit) {
initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
}
@@ -81,6 +293,9 @@ public:
uint64_t Size);
void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size);
+ Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr,
+ uint64_t Size, InitializerBuilder &IB);
+
Instruction *
insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
const DominatorTree *DT);
@@ -92,9 +307,12 @@ private:
Function *F;
Function *SetTagFunc;
const DataLayout *DL;
+ AAResults *AA;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (MergeInit)
+ AU.addRequired<AAResultsWrapperPass>();
}
};
@@ -107,8 +325,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
-FunctionPass *llvm::createAArch64StackTaggingPass() {
- return new AArch64StackTagging();
+FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) {
+ return new AArch64StackTagging(MergeInit);
+}
+
+Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
+ Value *StartPtr,
+ uint64_t Size,
+ InitializerBuilder &IB) {
+ MemoryLocation AllocaLoc{StartPtr, Size};
+ Instruction *LastInst = StartInst;
+ BasicBlock::iterator BI(StartInst);
+
+ unsigned Count = 0;
+ for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) {
+ if (!isa<DbgInfoIntrinsic>(*BI))
+ ++Count;
+
+ if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc)))
+ continue;
+
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ if (!NextStore->isSimple())
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset =
+ isPointerOffset(StartPtr, NextStore->getPointerOperand(), *DL);
+ if (!Offset)
+ break;
+
+ if (!IB.addStore(*Offset, NextStore, DL))
+ break;
+ LastInst = NextStore;
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ if (!isa<ConstantInt>(MSI->getValue()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), *DL);
+ if (!Offset)
+ break;
+
+ if (!IB.addMemSet(*Offset, MSI))
+ break;
+ LastInst = MSI;
+ }
+ }
+ return LastInst;
}
bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
@@ -127,8 +405,23 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore,
Value *Ptr, uint64_t Size) {
+ auto SetTagZeroFunc =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero);
+ auto StgpFunc =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp);
+
+ InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc);
+ bool LittleEndian =
+ Triple(AI->getModule()->getTargetTriple()).isLittleEndian();
+ // Current implementation of initializer merging assumes little endianness.
+ if (MergeInit && !F->hasOptNone() && LittleEndian) {
+ LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI
+ << ", size = " << Size << "\n");
+ InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB);
+ }
+
IRBuilder<> IRB(InsertBefore);
- IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ IB.generate(IRB);
}
void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
@@ -166,7 +459,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
}
void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
- unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize);
+ const Align NewAlignment =
+ max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize);
Info.AI->setAlignment(NewAlignment);
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
@@ -179,7 +473,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
Info.AI->isArrayAllocation()
? ArrayType::get(
Info.AI->getAllocatedType(),
- dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
+ cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
: Info.AI->getAllocatedType();
Type *PaddingType =
ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size);
@@ -187,7 +481,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
auto *NewAI = new AllocaInst(
TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
NewAI->takeName(Info.AI);
- NewAI->setAlignment(Info.AI->getAlignment());
+ NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment()));
NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
NewAI->setSwiftError(Info.AI->isSwiftError());
NewAI->copyMetadata(*Info.AI);
@@ -198,6 +492,24 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
Info.AI = NewAI;
}
+// Helper function to check for post-dominance.
+static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A,
+ const IntrinsicInst *B) {
+ const BasicBlock *ABB = A->getParent();
+ const BasicBlock *BBB = B->getParent();
+
+ if (ABB != BBB)
+ return PDT->dominates(ABB, BBB);
+
+ for (const Instruction &I : *ABB) {
+ if (&I == B)
+ return true;
+ if (&I == A)
+ return false;
+ }
+ llvm_unreachable("Corrupt instruction list");
+}
+
// FIXME: check for MTE extension
bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
@@ -205,6 +517,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
F = &Fn;
DL = &Fn.getParent()->getDataLayout();
+ if (MergeInit)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
SmallVector<Instruction *, 8> RetVec;
@@ -270,23 +584,31 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (NumInterestingAllocas == 0)
return true;
+ std::unique_ptr<DominatorTree> DeleteDT;
+ DominatorTree *DT = nullptr;
+ if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &P->getDomTree();
+
+ if (DT == nullptr && (NumInterestingAllocas > 1 ||
+ !F->hasFnAttribute(Attribute::OptimizeNone))) {
+ DeleteDT = std::make_unique<DominatorTree>(*F);
+ DT = DeleteDT.get();
+ }
+
+ std::unique_ptr<PostDominatorTree> DeletePDT;
+ PostDominatorTree *PDT = nullptr;
+ if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>())
+ PDT = &P->getPostDomTree();
+
+ if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) {
+ DeletePDT = std::make_unique<PostDominatorTree>(*F);
+ PDT = DeletePDT.get();
+ }
+
SetTagFunc =
Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
- // Compute DT only if the function has the attribute, there are more than 1
- // interesting allocas, and it is not available for free.
- Instruction *Base;
- if (NumInterestingAllocas > 1) {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- if (DTWP) {
- Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree());
- } else {
- DominatorTree DT(*F);
- Base = insertBaseTaggedPointer(Allocas, &DT);
- }
- } else {
- Base = insertBaseTaggedPointer(Allocas, nullptr);
- }
+ Instruction *Base = insertBaseTaggedPointer(Allocas, DT);
for (auto &I : Allocas) {
const AllocaInfo &Info = I.second;
@@ -309,11 +631,37 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 &&
Info.LifetimeEnd.size() == 1) {
IntrinsicInst *Start = Info.LifetimeStart[0];
+ IntrinsicInst *End = Info.LifetimeEnd[0];
uint64_t Size =
dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
- untagAlloca(AI, Info.LifetimeEnd[0], Size);
+ // We need to ensure that if we tag some object, we always untag it before
+ // the function exits.
+ if (PDT != nullptr && postDominates(PDT, End, Start)) {
+ untagAlloca(AI, End, Size);
+ } else {
+ SmallVector<Instruction *, 8> ReachableRetVec;
+ unsigned NumCoveredExits = 0;
+ for (auto &RI : RetVec) {
+ if (!isPotentiallyReachable(Start, RI, nullptr, DT))
+ continue;
+ ReachableRetVec.push_back(RI);
+ if (DT != nullptr && DT->dominates(End, RI))
+ ++NumCoveredExits;
+ }
+ // If the lifetime end dominates every reachable exit, a single untag at
+ // the lifetime end covers them all; otherwise untag at each reachable
+ // exit so the same memory is not untagged twice.
+ if (NumCoveredExits == ReachableRetVec.size()) {
+ untagAlloca(AI, End, Size);
+ } else {
+ for (auto &RI : ReachableRetVec)
+ untagAlloca(AI, RI, Size);
+ // We may have inserted an untag outside of the lifetime interval.
+ // Remove the lifetime end call for this alloca.
+ End->eraseFromParent();
+ }
+ }
} else {
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
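The masking arithmetic in applyMemSet() above is dense, so here is a standalone restatement (illustrative only, not the code the pass emits) of how a memset byte value is replicated into an 8-byte lane pattern and clipped to the covered [Start, End) range. As noted in the pass itself, this relies on a little-endian layout.

#include <cstdint>
#include <cstdio>

// Builds the 64-bit word recorded in Out[Offset] for a memset of byte value V
// over [Start, End), where Offset is the 8-aligned slot being filled.
static uint64_t memsetWord(int64_t Offset, int64_t Start, int64_t End,
                           uint8_t V) {
  uint64_t Cst = 0x0101010101010101ULL;  // one 0x01 per byte lane
  int LowBits = Offset < Start ? (Start - Offset) * 8 : 0;
  if (LowBits)
    Cst = (Cst >> LowBits) << LowBits;   // drop lanes below Start
  int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0;
  if (HighBits)
    Cst = (Cst << HighBits) >> HighBits; // drop lanes at or past End
  return Cst * V; // replicate V into the surviving lanes (no carries, V < 256)
}

int main() {
  // memset(p + 3, 0xAB, 7) as seen by the 8-byte slots at offsets 0 and 8:
  std::printf("%016llx\n", (unsigned long long)memsetWord(0, 3, 10, 0xAB));
  std::printf("%016llx\n", (unsigned long long)memsetWord(8, 3, 10, 0xAB));
  // Prints ababababab000000 and 000000000000abab: memory bytes 3..9 are set.
}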
diff --git a/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
new file mode 100644
index 000000000000..3cc556f74aea
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -0,0 +1,209 @@
+//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra"
+
+enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };
+
+static cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
+ "stack-tagging-unchecked-ld-st", cl::Hidden,
+ cl::init(UncheckedSafe),
+ cl::desc(
+ "Unconditionally apply unchecked-ld-st optimization (even for large "
+ "stack frames, or in the presence of variable sized allocas)."),
+ cl::values(
+ clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"),
+ clEnumValN(
+ UncheckedSafe, "safe",
+ "apply unchecked-ld-st when the target is definitely within range"),
+ clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
+
+namespace {
+
+class AArch64StackTaggingPreRA : public MachineFunctionPass {
+ MachineFunction *MF;
+ AArch64FunctionInfo *AFI;
+ MachineFrameInfo *MFI;
+ MachineRegisterInfo *MRI;
+ const AArch64RegisterInfo *TRI;
+ const AArch64InstrInfo *TII;
+
+ SmallVector<MachineInstr*, 16> ReTags;
+
+public:
+ static char ID;
+ AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {
+ initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool mayUseUncheckedLoadStore();
+ void uncheckUsesOf(unsigned TaggedReg, int FI);
+ void uncheckLoadsAndStores();
+
+ bool runOnMachineFunction(MachineFunction &Func) override;
+ StringRef getPassName() const override {
+ return "AArch64 Stack Tagging PreRA";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+char AArch64StackTaggingPreRA::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
+ "AArch64 Stack Tagging PreRA Pass", false, false)
+INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
+ "AArch64 Stack Tagging PreRA Pass", false, false)
+
+FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
+ return new AArch64StackTaggingPreRA();
+}
+
+static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDRWui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRBui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRHui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::STRBui:
+ case AArch64::STRBBui:
+ case AArch64::STRHui:
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
+ if (ClUncheckedLdSt == UncheckedNever)
+ return false;
+ else if (ClUncheckedLdSt == UncheckedAlways)
+ return true;
+
+ // This estimate could be improved if we had harder guarantees about stack
+ // frame layout. With LocalStackAllocation we can estimate the SP offset to any
+ // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged
+ // objects ahead of non-tagged ones, but that's not always desirable.
+ //
+ // Underestimating SP offset here may require the use of LDG to materialize
+ // the tagged address of the stack slot, along with a scratch register
+ // allocation (post-regalloc!).
+ //
+ // For now we do the safe thing here and require that the entire stack frame
+ // is within range of the shortest of the unchecked instructions.
+ unsigned FrameSize = 0;
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i)
+ FrameSize += MFI->getObjectSize(i);
+ bool EntireFrameReachableFromSP = FrameSize < 0xf00;
+ return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP;
+}
+
+void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
+ for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end();
+ UI != E;) {
+ MachineInstr *UseI = &*(UI++);
+ if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) {
+ // FI operand is always the one before the immediate offset.
+ unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1;
+ if (UseI->getOperand(OpIdx).isReg() &&
+ UseI->getOperand(OpIdx).getReg() == TaggedReg) {
+ UseI->getOperand(OpIdx).ChangeToFrameIndex(FI);
+ UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
+ }
+ } else if (UseI->isCopy() &&
+ Register::isVirtualRegister(UseI->getOperand(0).getReg())) {
+ uncheckUsesOf(UseI->getOperand(0).getReg(), FI);
+ }
+ }
+}
+
+void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
+ for (auto *I : ReTags) {
+ unsigned TaggedReg = I->getOperand(0).getReg();
+ int FI = I->getOperand(1).getIndex();
+ uncheckUsesOf(TaggedReg, FI);
+ }
+}
+
+bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ MRI = &MF->getRegInfo();
+ AFI = MF->getInfo<AArch64FunctionInfo>();
+ TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
+ TRI = static_cast<const AArch64RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ MFI = &MF->getFrameInfo();
+ ReTags.clear();
+
+ assert(MRI->isSSA());
+
+ LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
+ << "********** Function: " << MF->getName() << '\n');
+
+ SmallSetVector<int, 8> TaggedSlots;
+ for (auto &BB : *MF) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == AArch64::TAGPstack) {
+ ReTags.push_back(&I);
+ int FI = I.getOperand(1).getIndex();
+ TaggedSlots.insert(FI);
+ // There should be no offsets in TAGP yet.
+ assert(I.getOperand(2).getImm() == 0);
+ }
+ }
+ }
+
+ if (ReTags.empty())
+ return false;
+
+ if (mayUseUncheckedLoadStore())
+ uncheckLoadsAndStores();
+
+ return true;
+}
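The heuristic in mayUseUncheckedLoadStore() can be restated in isolation. The sketch below is illustrative only, not LLVM API; it reuses the 0xf00-byte bound hard-coded above, sums the frame object sizes, and refuses the optimization when variable-sized objects are present.

#include <cstdint>
#include <vector>

// Conservative check: only use unchecked loads/stores when every frame object
// is assumed to be reachable from SP by the shortest unchecked instruction.
static bool entireFrameReachableFromSP(const std::vector<uint64_t> &ObjectSizes,
                                       bool HasVarSizedObjects) {
  uint64_t FrameSize = 0;
  for (uint64_t Size : ObjectSizes)
    FrameSize += Size;
  return !HasVarSizedObjects && FrameSize < 0xf00;
}

int main() {
  // 1344 bytes of frame objects, no dynamic allocas: within range.
  return entireFrameReachableFromSP({64, 256, 1024}, false) ? 0 : 1;
}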
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 0e84a00df006..5deb601822b8 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -151,7 +151,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
int64_t Offset;
if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
BaseOp->isReg()) {
- unsigned BaseReg = BaseOp->getReg();
+ Register BaseReg = BaseOp->getReg();
if (PrevBaseReg == BaseReg) {
// If this block can take STPs, skip ahead to the next block.
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 3bc89b91c3f7..558bea368eff 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -71,19 +71,22 @@ void AArch64Subtarget::initializeProperties() {
case CortexA35:
break;
case CortexA53:
- PrefFunctionAlignment = 3;
+ PrefFunctionLogAlignment = 3;
break;
case CortexA55:
break;
case CortexA57:
MaxInterleaveFactor = 4;
- PrefFunctionAlignment = 4;
+ PrefFunctionLogAlignment = 4;
+ break;
+ case CortexA65:
+ PrefFunctionLogAlignment = 3;
break;
case CortexA72:
case CortexA73:
case CortexA75:
case CortexA76:
- PrefFunctionAlignment = 4;
+ PrefFunctionLogAlignment = 4;
break;
case Cyclone:
CacheLineSize = 64;
@@ -94,14 +97,14 @@ void AArch64Subtarget::initializeProperties() {
case ExynosM1:
MaxInterleaveFactor = 4;
MaxJumpTableSize = 8;
- PrefFunctionAlignment = 4;
- PrefLoopAlignment = 3;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 3;
break;
case ExynosM3:
MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
- PrefFunctionAlignment = 5;
- PrefLoopAlignment = 4;
+ PrefFunctionLogAlignment = 5;
+ PrefLoopLogAlignment = 4;
break;
case Falkor:
MaxInterleaveFactor = 4;
@@ -122,6 +125,12 @@ void AArch64Subtarget::initializeProperties() {
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
+ case NeoverseE1:
+ PrefFunctionLogAlignment = 3;
+ break;
+ case NeoverseN1:
+ PrefFunctionLogAlignment = 4;
+ break;
case Saphira:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
@@ -129,8 +138,8 @@ void AArch64Subtarget::initializeProperties() {
break;
case ThunderX2T99:
CacheLineSize = 64;
- PrefFunctionAlignment = 3;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 3;
+ PrefLoopLogAlignment = 2;
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
@@ -143,15 +152,15 @@ void AArch64Subtarget::initializeProperties() {
case ThunderXT81:
case ThunderXT83:
CacheLineSize = 128;
- PrefFunctionAlignment = 3;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 3;
+ PrefLoopLogAlignment = 2;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
case TSV110:
CacheLineSize = 64;
- PrefFunctionAlignment = 4;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 2;
break;
}
}
@@ -187,7 +196,7 @@ const CallLowering *AArch64Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
+InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -201,7 +210,7 @@ const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
-unsigned char
+unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
// MachO large model always goes via a GOT, simply to get a single 8-byte
@@ -224,10 +233,17 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
GV->hasExternalWeakLinkage())
return AArch64II::MO_GOT;
+ // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
+ // that their nominal addresses are tagged and outside of the code model. In
+ // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
+ // tag if necessary based on MO_TAGGED.
+ if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
+ return AArch64II::MO_NC | AArch64II::MO_TAGGED;
+
return AArch64II::MO_NO_FLAG;
}
-unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
+unsigned AArch64Subtarget::classifyGlobalFunctionReference(
const GlobalValue *GV, const TargetMachine &TM) const {
// MachO large model always goes via a GOT, because we don't have the
// relocations available to do anything else..
@@ -275,7 +291,7 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
+ return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
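Note that the renamed Pref*LogAlignment fields above hold log2 values rather than byte counts, so the per-CPU numbers keep their previous meaning: 3 is 8-byte alignment, 4 is 16-byte, 5 is 32-byte. A trivial sketch of the conversion (illustration only, not LLVM API):

#include <cassert>
#include <cstdint>

static uint64_t logAlignToBytes(unsigned LogAlign) {
  return uint64_t(1) << LogAlign; // PrefFunctionLogAlignment = 4 -> 16 bytes
}

int main() {
  assert(logAlignToBytes(3) == 8);  // e.g. Cortex-A53, Neoverse E1
  assert(logAlignToBytes(4) == 16); // e.g. Cortex-A57, Neoverse N1
  assert(logAlignToBytes(5) == 32); // e.g. Exynos M3 function alignment
  return 0;
}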
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 0c84cfb8329a..f3212fae8e5e 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -42,6 +42,7 @@ public:
CortexA53,
CortexA55,
CortexA57,
+ CortexA65,
CortexA72,
CortexA73,
CortexA75,
@@ -51,6 +52,8 @@ public:
ExynosM3,
Falkor,
Kryo,
+ NeoverseE1,
+ NeoverseN1,
Saphira,
ThunderX2T99,
ThunderX,
@@ -113,6 +116,7 @@ protected:
bool HasTRACEV8_4 = false;
bool HasAM = false;
bool HasSEL2 = false;
+ bool HasPMU = false;
bool HasTLB_RMI = false;
bool HasFMI = false;
bool HasRCPC_IMMO = false;
@@ -134,6 +138,7 @@ protected:
bool HasBTI = false;
bool HasRandGen = false;
bool HasMTE = false;
+ bool HasTME = false;
// Arm SVE2 extensions
bool HasSVE2AES = false;
@@ -141,6 +146,10 @@ protected:
bool HasSVE2SHA3 = false;
bool HasSVE2BitPerm = false;
+ // Future architecture extensions.
+ bool HasETE = false;
+ bool HasTRBE = false;
+
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -183,14 +192,15 @@ protected:
bool UseEL1ForTP = false;
bool UseEL2ForTP = false;
bool UseEL3ForTP = false;
+ bool AllowTaggedGlobals = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
uint16_t PrefetchDistance = 0;
uint16_t MinPrefetchStride = 1;
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
- unsigned PrefFunctionAlignment = 0;
- unsigned PrefLoopAlignment = 0;
+ unsigned PrefFunctionLogAlignment = 0;
+ unsigned PrefLoopLogAlignment = 0;
unsigned MaxJumpTableSize = 0;
unsigned WideningBaseCost = 0;
@@ -247,7 +257,7 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -344,14 +354,16 @@ public:
unsigned getVectorInsertExtractBaseCost() const {
return VectorInsertExtractBaseCost;
}
- unsigned getCacheLineSize() const { return CacheLineSize; }
- unsigned getPrefetchDistance() const { return PrefetchDistance; }
- unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
- unsigned getMaxPrefetchIterationsAhead() const {
+ unsigned getCacheLineSize() const override { return CacheLineSize; }
+ unsigned getPrefetchDistance() const override { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const override { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const override {
return MaxPrefetchIterationsAhead;
}
- unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
- unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+ unsigned getPrefFunctionLogAlignment() const {
+ return PrefFunctionLogAlignment;
+ }
+ unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
@@ -380,6 +392,7 @@ public:
bool hasBTI() const { return HasBTI; }
bool hasRandGen() const { return HasRandGen; }
bool hasMTE() const { return HasMTE; }
+ bool hasTME() const { return HasTME; }
// Arm SVE2 extensions
bool hasSVE2AES() const { return HasSVE2AES; }
bool hasSVE2SM4() const { return HasSVE2SM4; }
@@ -399,6 +412,8 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isTargetILP32() const { return TargetTriple.isArch32Bit(); }
+
bool useAA() const override { return UseAA; }
bool hasVH() const { return HasVH; }
@@ -421,10 +436,17 @@ public:
bool hasTRACEV8_4() const { return HasTRACEV8_4; }
bool hasAM() const { return HasAM; }
bool hasSEL2() const { return HasSEL2; }
+ bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
bool hasFMI() const { return HasFMI; }
bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
+ bool addrSinkUsingGEPs() const override {
+ // Keeping GEPs inbounds is important for exploiting AArch64
+ // addressing-modes in ILP32 mode.
+ return useAA() || isTargetILP32();
+ }
+
bool useSmallAddressing() const {
switch (TLInfo.getTargetMachine().getCodeModel()) {
case CodeModel::Kernel:
@@ -443,11 +465,11 @@ public:
/// ClassifyGlobalReference - Find the target operand flags that describe
/// how a global value should be referenced for the current subtarget.
- unsigned char ClassifyGlobalReference(const GlobalValue *GV,
- const TargetMachine &TM) const;
+ unsigned ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
- unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
- const TargetMachine &TM) const;
+ unsigned classifyGlobalFunctionReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index 536a6591478b..05249a4ea6a8 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -612,6 +612,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>;
def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>;
def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>;
def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>;
// Trace registers
// Op0 Op1 CRn CRm Op2
@@ -1321,6 +1322,12 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>;
def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
} // FeatureSEL2
+// v8.4a PMU registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeaturePMU} }] in {
+def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
+} // FeaturePMU
+
// v8.4a RAS registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
@@ -1452,14 +1459,37 @@ let Requires = [{ {AArch64::FeatureMTE} }] in {
def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>;
def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>;
def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>;
-def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0110, 0b0101, 0b000>;
-def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>;
-def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>;
-def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>;
-def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>;
+def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>;
def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>;
} // HasMTE
+// Embedded Trace Extension R/W System registers
+let Requires = [{ {AArch64::FeatureETE} }] in {
+// Name Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>;
+// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR
+def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>;
+} // FeatureETE
+
+// Trace Buffer Extension System registers
+let Requires = [{ {AArch64::FeatureTRBE} }] in {
+// Name Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>;
+def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>;
+def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>;
+def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>;
+def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>;
+def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>;
+def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>;
+} // FeatureTRBE
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcCyclone} }] in
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 865461480499..b3ed96e815be 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -157,6 +157,8 @@ extern "C" void LLVMInitializeAArch64Target() {
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget());
RegisterTargetMachine<AArch64leTargetMachine> Z(getTheARM64Target());
+ RegisterTargetMachine<AArch64leTargetMachine> W(getTheARM64_32Target());
+ RegisterTargetMachine<AArch64leTargetMachine> V(getTheAArch64_32Target());
auto PR = PassRegistry::getPassRegistry();
initializeGlobalISel(*PR);
initializeAArch64A53Fix835769Pass(*PR);
@@ -180,6 +182,7 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeLDTLSCleanupPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
+ initializeAArch64StackTaggingPreRAPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -187,11 +190,11 @@ extern "C" void LLVMInitializeAArch64Target() {
//===----------------------------------------------------------------------===//
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return llvm::make_unique<AArch64_MachoTargetObjectFile>();
+ return std::make_unique<AArch64_MachoTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return llvm::make_unique<AArch64_COFFTargetObjectFile>();
+ return std::make_unique<AArch64_COFFTargetObjectFile>();
- return llvm::make_unique<AArch64_ELFTargetObjectFile>();
+ return std::make_unique<AArch64_ELFTargetObjectFile>();
}
// Helper function to build a DataLayout string
@@ -200,8 +203,11 @@ static std::string computeDataLayout(const Triple &TT,
bool LittleEndian) {
if (Options.getABIName() == "ilp32")
return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
- if (TT.isOSBinFormatMachO())
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::aarch64_32)
+ return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128";
return "e-m:o-i64:64-i128:128-n32:64-S128";
+ }
if (TT.isOSBinFormatCOFF())
return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
@@ -277,8 +283,11 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
this->Options.TrapUnreachable = true;
}
- // Enable GlobalISel at or below EnableGlobalISelAt0.
- if (getOptLevel() <= EnableGlobalISelAtO) {
+ // Enable GlobalISel at or below EnableGlobalISelAt0, unless this is
+ // MachO/CodeModel::Large, which GlobalISel does not support.
+ if (getOptLevel() <= EnableGlobalISelAtO &&
+ TT.getArch() != Triple::aarch64_32 &&
+ !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) {
setGlobalISel(true);
setGlobalISelAbort(GlobalISelAbortMode::Disable);
}
@@ -310,7 +319,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
+ I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
isLittle);
}
return I.get();
@@ -448,7 +457,8 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
- addPass(createAArch64StackTaggingPass());
+ addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() !=
+ CodeGenOpt::None));
}
// Pass Pipeline Configuration
@@ -502,7 +512,8 @@ bool AArch64PassConfig::addIRTranslator() {
}
void AArch64PassConfig::addPreLegalizeMachineIR() {
- addPass(createAArch64PreLegalizeCombiner());
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAArch64PreLegalizeCombiner(IsOptNone));
}
bool AArch64PassConfig::addLegalizeMachineIR() {
@@ -516,9 +527,7 @@ bool AArch64PassConfig::addRegBankSelect() {
}
void AArch64PassConfig::addPreGlobalInstructionSelect() {
- // Workaround the deficiency of the fast register allocator.
- if (TM->getOptLevel() == CodeGenOpt::None)
- addPass(new Localizer());
+ addPass(new Localizer());
}
bool AArch64PassConfig::addGlobalInstructionSelect() {
@@ -540,6 +549,8 @@ bool AArch64PassConfig::addILPOpts() {
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
addPass(createAArch64SIMDInstrOptPass());
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64StackTaggingPreRAPass());
return true;
}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 1c3d5d0743ad..54562094fcf5 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -59,8 +59,8 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
}
const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
- const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
- MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
assert((Offset+MV.getConstant() == 0) &&
"Arch64 does not support GOT PC rel with extra offset");
// On ARM64 Darwin, we can reference symbols with foo@GOT-., which
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 7ead363d42fe..1cb4c028c80d 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -35,7 +35,8 @@ public:
const TargetMachine &TM,
MachineModuleInfo *MMI) const override;
- const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
+ const MCSymbol *Sym,
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a4b78f2a7d6b..dc916a7b3407 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
+AArch64TTIImpl::TTI::MemCmpExpansionOptions
+AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+ // TODO: Though vector loads usually perform well on AArch64, on some targets
+ // they may wake up the FP unit, which raises the power consumption. Perhaps
+ // they could be used with no holds barred (-O3).
+ Options.LoadSizes = {8, 4, 2, 1};
+ return Options;
+}
+
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
unsigned Alignment, unsigned AddressSpace,
const Instruction *I) {
@@ -879,22 +892,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
return Considerable;
}
-unsigned AArch64TTIImpl::getCacheLineSize() {
- return ST->getCacheLineSize();
-}
-
-unsigned AArch64TTIImpl::getPrefetchDistance() {
- return ST->getPrefetchDistance();
-}
-
-unsigned AArch64TTIImpl::getMinPrefetchStride() {
- return ST->getMinPrefetchStride();
-}
-
-unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
- return ST->getMaxPrefetchIterationsAhead();
-}
-
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
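For context on the options returned by enableMemCmpExpansion(): with AllowOverlappingLoads set and LoadSizes = {8, 4, 2, 1}, an equality-only (IsZeroCmp) comparison of, say, 15 bytes can be lowered to two 8-byte loads per side, the second pair overlapping the first by one byte. The following is only an illustrative sketch of that shape in plain C++, not what the expansion pass actually emits.

#include <cstdint>
#include <cstdio>
#include <cstring>

// memcmp(a, b, 15) == 0 via two overlapping 8-byte loads per side.
static bool equal15(const char *A, const char *B) {
  uint64_t A0, B0, A1, B1;
  std::memcpy(&A0, A, 8);     // bytes [0, 8)
  std::memcpy(&B0, B, 8);
  std::memcpy(&A1, A + 7, 8); // bytes [7, 15), overlapping byte 7
  std::memcpy(&B1, B + 7, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;
}

int main() {
  char X[15] = "deadbeefcafe01";
  char Y[15] = "deadbeefcafe01";
  std::printf("%d\n", equal15(X, Y)); // 1
  Y[14] ^= 1;                         // flip the last byte
  std::printf("%d\n", equal15(X, Y)); // 0
}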
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 10c15a139b4c..32c59f41e1c3 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -85,7 +85,8 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 32;
@@ -130,6 +131,9 @@ public:
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
+
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
@@ -153,14 +157,6 @@ public:
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
- unsigned getCacheLineSize();
-
- unsigned getPrefetchDistance();
-
- unsigned getMinPrefetchStride();
-
- unsigned getMaxPrefetchIterationsAhead();
-
bool shouldExpandReduction(const IntrinsicInst *II) const {
return false;
}
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index f4c55d48d215..4fb409f020d9 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -935,48 +935,34 @@ public:
return false;
}
- bool isMovZSymbolG3() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+ bool isMovWSymbolG3() const {
+ return isMovWSymbol({AArch64MCExpr::VK_ABS_G3, AArch64MCExpr::VK_PREL_G3});
}
- bool isMovZSymbolG2() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
- AArch64MCExpr::VK_TPREL_G2,
- AArch64MCExpr::VK_DTPREL_G2});
- }
-
- bool isMovZSymbolG1() const {
- return isMovWSymbol({
- AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
- AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
- AArch64MCExpr::VK_DTPREL_G1,
- });
- }
-
- bool isMovZSymbolG0() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
- AArch64MCExpr::VK_TPREL_G0,
- AArch64MCExpr::VK_DTPREL_G0});
- }
-
- bool isMovKSymbolG3() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
- }
-
- bool isMovKSymbolG2() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
+ bool isMovWSymbolG2() const {
+ return isMovWSymbol(
+ {AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+ AArch64MCExpr::VK_ABS_G2_NC, AArch64MCExpr::VK_PREL_G2,
+ AArch64MCExpr::VK_PREL_G2_NC, AArch64MCExpr::VK_TPREL_G2,
+ AArch64MCExpr::VK_DTPREL_G2});
}
- bool isMovKSymbolG1() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
- AArch64MCExpr::VK_TPREL_G1_NC,
- AArch64MCExpr::VK_DTPREL_G1_NC});
+ bool isMovWSymbolG1() const {
+ return isMovWSymbol(
+ {AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
+ AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_PREL_G1,
+ AArch64MCExpr::VK_PREL_G1_NC, AArch64MCExpr::VK_GOTTPREL_G1,
+ AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_TPREL_G1_NC,
+ AArch64MCExpr::VK_DTPREL_G1, AArch64MCExpr::VK_DTPREL_G1_NC});
}
- bool isMovKSymbolG0() const {
+ bool isMovWSymbolG0() const {
return isMovWSymbol(
- {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
- AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
+ {AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+ AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_PREL_G0,
+ AArch64MCExpr::VK_PREL_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
+ AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_TPREL_G0_NC,
+ AArch64MCExpr::VK_DTPREL_G0, AArch64MCExpr::VK_DTPREL_G0_NC});
}
template<int RegWidth, int Shift>
@@ -1814,7 +1800,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Token, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Token, Ctx);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->Tok.IsSuffix = IsSuffix;
@@ -1829,7 +1815,7 @@ public:
AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
unsigned ShiftAmount = 0,
unsigned HasExplicitAmount = false) {
- auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = Kind;
Op->Reg.ElementWidth = 0;
@@ -1861,7 +1847,7 @@ public:
CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.NumElements = NumElements;
@@ -1874,7 +1860,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_VectorIndex, Ctx);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1883,7 +1869,7 @@ public:
static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Immediate, Ctx);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1894,7 +1880,7 @@ public:
unsigned ShiftAmount,
SMLoc S, SMLoc E,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
Op->ShiftedImm .Val = Val;
Op->ShiftedImm.ShiftAmount = ShiftAmount;
Op->StartLoc = S;
@@ -1904,7 +1890,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_CondCode, Ctx);
Op->CondCode.Code = Code;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1913,7 +1899,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_FPImm, Ctx);
Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
Op->FPImm.IsExact = IsExact;
Op->StartLoc = S;
@@ -1925,7 +1911,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx);
Op->Barrier.Val = Val;
Op->Barrier.Data = Str.data();
Op->Barrier.Length = Str.size();
@@ -1939,7 +1925,7 @@ public:
uint32_t MSRReg,
uint32_t PStateField,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_SysReg, Ctx);
Op->SysReg.Data = Str.data();
Op->SysReg.Length = Str.size();
Op->SysReg.MRSReg = MRSReg;
@@ -1952,7 +1938,7 @@ public:
static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_SysCR, Ctx);
Op->SysCRImm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1963,7 +1949,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Prefetch, Ctx);
Op->Prefetch.Val = Val;
Op->Barrier.Data = Str.data();
Op->Barrier.Length = Str.size();
@@ -1976,7 +1962,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_PSBHint, Ctx);
Op->PSBHint.Val = Val;
Op->PSBHint.Data = Str.data();
Op->PSBHint.Length = Str.size();
@@ -1989,7 +1975,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_BTIHint, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_BTIHint, Ctx);
Op->BTIHint.Val = Val << 1 | 32;
Op->BTIHint.Data = Str.data();
Op->BTIHint.Length = Str.size();
@@ -2001,7 +1987,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
Op->ShiftExtend.Type = ShOp;
Op->ShiftExtend.Amount = Val;
Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
@@ -2840,7 +2826,7 @@ static const struct Extension {
{"sve2-aes", {AArch64::FeatureSVE2AES}},
{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
{"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
- {"bitperm", {AArch64::FeatureSVE2BitPerm}},
+ {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
// FIXME: Unsupported extensions
{"pan", {}},
{"lor", {}},
@@ -3260,6 +3246,13 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
.Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
.Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+ .Case("prel_g3", AArch64MCExpr::VK_PREL_G3)
+ .Case("prel_g2", AArch64MCExpr::VK_PREL_G2)
+ .Case("prel_g2_nc", AArch64MCExpr::VK_PREL_G2_NC)
+ .Case("prel_g1", AArch64MCExpr::VK_PREL_G1)
+ .Case("prel_g1_nc", AArch64MCExpr::VK_PREL_G1_NC)
+ .Case("prel_g0", AArch64MCExpr::VK_PREL_G0)
+ .Case("prel_g0_nc", AArch64MCExpr::VK_PREL_G0_NC)
.Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
.Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
.Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
@@ -5283,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
auto parseOp = [&]() -> bool {
SMLoc L = getLoc();
- const MCExpr *Expr;
+ const MCExpr *Expr = nullptr;
if (check(getParser().parseExpression(Expr), L, "expected expression"))
return true;
const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
@@ -5542,43 +5535,43 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
switch (Kind) {
default:
return Match_InvalidOperand;
- case MCK__35_0:
+ case MCK__HASH_0:
ExpectedVal = 0;
break;
- case MCK__35_1:
+ case MCK__HASH_1:
ExpectedVal = 1;
break;
- case MCK__35_12:
+ case MCK__HASH_12:
ExpectedVal = 12;
break;
- case MCK__35_16:
+ case MCK__HASH_16:
ExpectedVal = 16;
break;
- case MCK__35_2:
+ case MCK__HASH_2:
ExpectedVal = 2;
break;
- case MCK__35_24:
+ case MCK__HASH_24:
ExpectedVal = 24;
break;
- case MCK__35_3:
+ case MCK__HASH_3:
ExpectedVal = 3;
break;
- case MCK__35_32:
+ case MCK__HASH_32:
ExpectedVal = 32;
break;
- case MCK__35_4:
+ case MCK__HASH_4:
ExpectedVal = 4;
break;
- case MCK__35_48:
+ case MCK__HASH_48:
ExpectedVal = 48;
break;
- case MCK__35_6:
+ case MCK__HASH_6:
ExpectedVal = 6;
break;
- case MCK__35_64:
+ case MCK__HASH_64:
ExpectedVal = 64;
break;
- case MCK__35_8:
+ case MCK__HASH_8:
ExpectedVal = 8;
break;
}
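
The :prel_g*: movw modifiers added above select the MOVW_PREL relocation family. A minimal standalone sketch of that mapping (plain C++ containers rather than the parser's StringSwitch chain shown above; the table is written out for the LP64 relocation spellings and is illustrative, not generated from the target description):

    #include <cstdio>
    #include <map>
    #include <string>

    // Illustrative mapping from the assembler modifier (":prel_gN[_nc]:")
    // to the ELF relocation it is expected to select in the LP64 case.
    static const std::map<std::string, std::string> PrelMovwRelocs = {
        {"prel_g3",    "R_AARCH64_MOVW_PREL_G3"},
        {"prel_g2",    "R_AARCH64_MOVW_PREL_G2"},
        {"prel_g2_nc", "R_AARCH64_MOVW_PREL_G2_NC"},
        {"prel_g1",    "R_AARCH64_MOVW_PREL_G1"},
        {"prel_g1_nc", "R_AARCH64_MOVW_PREL_G1_NC"},
        {"prel_g0",    "R_AARCH64_MOVW_PREL_G0"},
        {"prel_g0_nc", "R_AARCH64_MOVW_PREL_G0_NC"},
    };

    int main() {
      // e.g. a "movz x0, #:prel_g1:sym" operand should end up as MOVW_PREL_G1.
      for (const auto &Entry : PrelMovwRelocs)
        std::printf(":%s: -> %s\n", Entry.first.c_str(), Entry.second.c_str());
      return 0;
    }
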
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 6418211a4f55..21ce5785ea5e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -153,9 +153,8 @@ static unsigned AdrImmBits(unsigned Value) {
static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t Value, MCContext &Ctx,
const Triple &TheTriple, bool IsResolved) {
- unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Kind) {
+ switch (Fixup.getTargetKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
@@ -574,7 +573,7 @@ public:
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
unsigned XReg =
- getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true));
+ getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true));
// Other CFA registers than FP are not supported by compact unwind.
// Fallback on DWARF.
@@ -593,8 +592,8 @@ public:
assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
"Frame pointer not pushed!");
- unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
- unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+ unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true);
+ unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true);
LRReg = getXRegFromWReg(LRReg);
FPReg = getXRegFromWReg(FPReg);
@@ -615,14 +614,14 @@ public:
case MCCFIInstruction::OpOffset: {
// Registers are saved in pairs. We expect there to be two consecutive
// `.cfi_offset' instructions with the appropriate registers specified.
- unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ unsigned Reg1 = *MRI.getLLVMRegNum(Inst.getRegister(), true);
if (i + 1 == e)
return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &Inst2 = Instrs[++i];
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
return CU::UNWIND_ARM64_MODE_DWARF;
- unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+ unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true);
// N.B. The encodings must be in register number order, and the X
// registers before the D registers.
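
The dereferences added above (*MRI.getLLVMRegNum(...)) reflect the DWARF-to-LLVM register lookup now yielding an optional value instead of a raw unsigned. A minimal sketch of the same pattern with std::optional, using a hypothetical lookup function rather than the MCRegisterInfo API:

    #include <cassert>
    #include <cstdio>
    #include <optional>

    // Hypothetical stand-in for a DWARF->LLVM register lookup that can fail.
    static std::optional<unsigned> lookupLLVMRegNum(unsigned DwarfReg) {
      if (DwarfReg > 31)   // outside the range this sketch models
        return std::nullopt;
      return DwarfReg;     // identity mapping, purely illustrative
    }

    int main() {
      std::optional<unsigned> Reg = lookupLLVMRegNum(30);
      assert(Reg && "caller is expected to pass a register that maps cleanly");
      // As in the compact-unwind emission above, the value is dereferenced
      // directly once the mapping is known to exist.
      std::printf("LLVM register number: %u\n", *Reg);
      return 0;
    }
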
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index c871e2c62eac..0fd1ca187be7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
static bool isNonILP32reloc(const MCFixup &Fixup,
AArch64MCExpr::VariantKind RefKind,
MCContext &Ctx) {
- if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
+ if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw)
return false;
switch (RefKind) {
case AArch64MCExpr::VK_ABS_G3:
@@ -120,7 +120,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
"Should only be expression-level modifiers here");
if (IsPCRel) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
@@ -184,7 +184,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
return ELF::R_AARCH64_NONE;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case FK_NONE:
return ELF::R_AARCH64_NONE;
case FK_Data_1:
@@ -394,6 +394,20 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(MOVW_SABS_G0);
if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
return R_CLS(MOVW_UABS_G0_NC);
+ if (RefKind == AArch64MCExpr::VK_PREL_G3)
+ return ELF::R_AARCH64_MOVW_PREL_G3;
+ if (RefKind == AArch64MCExpr::VK_PREL_G2)
+ return ELF::R_AARCH64_MOVW_PREL_G2;
+ if (RefKind == AArch64MCExpr::VK_PREL_G2_NC)
+ return ELF::R_AARCH64_MOVW_PREL_G2_NC;
+ if (RefKind == AArch64MCExpr::VK_PREL_G1)
+ return R_CLS(MOVW_PREL_G1);
+ if (RefKind == AArch64MCExpr::VK_PREL_G1_NC)
+ return ELF::R_AARCH64_MOVW_PREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_PREL_G0)
+ return R_CLS(MOVW_PREL_G0);
+ if (RefKind == AArch64MCExpr::VK_PREL_G0_NC)
+ return R_CLS(MOVW_PREL_G0_NC);
if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
@@ -434,5 +448,5 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) {
- return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
+ return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index d0a544273b8b..1a16468484ad 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
int ImmS = MI->getOperand(4).getImm();
if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
- (ImmR == 0 || ImmS < ImmR)) {
+ (ImmR == 0 || ImmS < ImmR) &&
+ STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
// BFC takes precedence over its entire range, slightly differently to BFI.
int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
int LSB = (BitWidth - ImmR) % BitWidth;
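
The added feature check keys BFC printing to v8.2-A; the operand recovery itself is simple arithmetic. A worked example using the same LSB formula as the printer above, with the width relation (imms + 1) taken from the architectural BFC alias definition:

    #include <cstdio>

    // Recover BFC-style operands from a BFM encoding with a zero source.
    int main() {
      const int BitWidth = 32;   // 32-bit form (BFMWri)
      const int ImmR = 28;       // immr field
      const int ImmS = 3;        // imms field (ImmS < ImmR, so the alias applies)
      const int LSB = (BitWidth - ImmR) % BitWidth;  // -> 4
      const int Width = ImmS + 1;                    // -> 4
      // So "bfm w0, wzr, #28, #3" would print as "bfc w0, #4, #4".
      std::printf("bfc w0, #%d, #%d\n", LSB, Width);
      return 0;
    }
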
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ecff1ab0a8b3..5926a4f81616 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -30,7 +30,7 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
-AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
// We prefer NEON instructions to be printed in the short, Apple-specific
// form when targeting Darwin.
AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
@@ -39,7 +39,8 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
- CodePointerSize = CalleeSaveStackSlotSize = 8;
+ CalleeSaveStackSlotSize = 8;
+ CodePointerSize = IsILP32 ? 4 : 8;
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 36ae92afc8c1..7274ae79f74a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -23,7 +23,7 @@ class Target;
class Triple;
struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
- explicit AArch64MCAsmInfoDarwin();
+ explicit AArch64MCAsmInfoDarwin(bool IsILP32);
const MCExpr *
getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 0a529321edc8..548e399e05a3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -42,6 +42,13 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_ABS_G0: return ":abs_g0:";
case VK_ABS_G0_S: return ":abs_g0_s:";
case VK_ABS_G0_NC: return ":abs_g0_nc:";
+ case VK_PREL_G3: return ":prel_g3:";
+ case VK_PREL_G2: return ":prel_g2:";
+ case VK_PREL_G2_NC: return ":prel_g2_nc:";
+ case VK_PREL_G1: return ":prel_g1:";
+ case VK_PREL_G1_NC: return ":prel_g1_nc:";
+ case VK_PREL_G0: return ":prel_g0:";
+ case VK_PREL_G0_NC: return ":prel_g0_nc:";
case VK_DTPREL_G2: return ":dtprel_g2:";
case VK_DTPREL_G1: return ":dtprel_g1:";
case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index ec9c95911628..a82ff2e91426 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -27,12 +27,13 @@ public:
// symbol. E.g. direct, via the GOT, ...
VK_ABS = 0x001,
VK_SABS = 0x002,
- VK_GOT = 0x003,
- VK_DTPREL = 0x004,
- VK_GOTTPREL = 0x005,
- VK_TPREL = 0x006,
- VK_TLSDESC = 0x007,
- VK_SECREL = 0x008,
+ VK_PREL = 0x003,
+ VK_GOT = 0x004,
+ VK_DTPREL = 0x005,
+ VK_GOTTPREL = 0x006,
+ VK_TPREL = 0x007,
+ VK_TLSDESC = 0x008,
+ VK_SECREL = 0x009,
VK_SymLocBits = 0x00f,
// Variants specifying which part of the final address calculation is
@@ -72,6 +73,13 @@ public:
VK_ABS_G0_S = VK_SABS | VK_G0,
VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
+ VK_PREL_G3 = VK_PREL | VK_G3,
+ VK_PREL_G2 = VK_PREL | VK_G2,
+ VK_PREL_G2_NC = VK_PREL | VK_G2 | VK_NC,
+ VK_PREL_G1 = VK_PREL | VK_G1,
+ VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC,
+ VK_PREL_G0 = VK_PREL | VK_G0,
+ VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC,
VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
VK_GOT_PAGE = VK_GOT | VK_PAGE,
VK_DTPREL_G2 = VK_DTPREL | VK_G2,
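
The composed VK_PREL_G* kinds above follow the layout documented at the top of the enum: the low nibble (VK_SymLocBits) holds the symbol-location class, and the higher bits select which part of the address calculation is taken. A small sketch of that classification; the two "part" constants are placeholders, not the real VK_G1/VK_NC encodings:

    #include <cstdio>

    enum : unsigned {
      VK_PREL       = 0x003,   // as defined in AArch64MCExpr.h above
      VK_SymLocBits = 0x00f,
      ILLUSTRATIVE_PART_G1 = 0x050,   // hypothetical placeholder bit
      ILLUSTRATIVE_PART_NC = 0x100,   // hypothetical placeholder bit
    };

    int main() {
      unsigned Kind = VK_PREL | ILLUSTRATIVE_PART_G1 | ILLUSTRATIVE_PART_NC;
      // Classification only looks at the low nibble, so every VK_PREL_G*
      // variant reports the same symbol-location class.
      if ((Kind & VK_SymLocBits) == VK_PREL)
        std::printf("symbol-location class: PREL\n");
      return 0;
    }
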
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index df12274d9470..1d583ec0087b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -241,7 +241,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TheTriple) {
MCAsmInfo *MAI;
if (TheTriple.isOSBinFormatMachO())
- MAI = new AArch64MCAsmInfoDarwin();
+ MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32);
else if (TheTriple.isWindowsMSVCEnvironment())
MAI = new AArch64MCAsmInfoMicrosoftCOFF();
else if (TheTriple.isOSBinFormatCOFF())
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index b3ce5ef22eef..fc04d37eb362 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -54,7 +54,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
Log2Size = ~0U;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
return false;
@@ -406,6 +406,6 @@ void AArch64MachObjectWriter::recordRelocation(
std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
bool IsILP32) {
- return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
+ return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
IsILP32);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index a45880a07427..aa50bd05cb71 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -120,7 +120,7 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() {
- return llvm::make_unique<AArch64WinCOFFObjectWriter>();
+ return std::make_unique<AArch64WinCOFFObjectWriter>();
}
} // end namespace llvm
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 808e59467081..8ccf6aa675ba 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -279,6 +279,19 @@ let Predicates = [HasSVE] in {
defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
}
+//===----------------------------------------------------------------------===//
+// SVE pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ Instruction inst>
+: Pat<(vtd (op vt1:$Op1)),
+ (inst $Op1)>;
+
+class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+ (inst $Op1, $Op2, $Op3)>;
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
@@ -403,12 +416,12 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
}
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
- ZPRRegOp zprty>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
- asm, "\t$Zdn, $Pg",
+ ZPRRegOp zprty, PPRRegOp pprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
+ asm, "\t$Zdn, $Pm",
"",
[]>, Sched<[]> {
- bits<4> Pg;
+ bits<4> Pm;
bits<5> Zdn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
@@ -416,7 +429,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10000;
let Inst{10-9} = opc{1-0};
- let Inst{8-5} = Pg;
+ let Inst{8-5} = Pm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
@@ -425,9 +438,16 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
}
multiclass sve_int_count_v<bits<5> opc, string asm> {
- def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
- def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
- def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
+
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
}
class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
@@ -609,11 +629,12 @@ multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
//===----------------------------------------------------------------------===//
class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
- RegisterClass srcRegType>
+ ValueType vt, RegisterClass srcRegType,
+ SDPatternOperator op>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
asm, "\t$Zd, $Rn",
"",
- []>, Sched<[]> {
+ [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
@@ -623,11 +644,11 @@ class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_perm_dup_r<string asm> {
- def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
- def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
- def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
- def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;
+multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> {
+ def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>;
+ def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>;
+ def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>;
+ def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
@@ -744,7 +765,7 @@ multiclass sve2_int_perm_tbl<string asm> {
}
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
@@ -758,6 +779,8 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Inst{15-10} = 0b001011;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_perm_tbx<string asm> {
@@ -826,10 +849,14 @@ class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
+multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
+
+ def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -1197,10 +1224,12 @@ multiclass sve_fp_ftmad<string asm> {
//===----------------------------------------------------------------------===//
class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty>
+ ZPRRegOp zprty,
+ ValueType vt, ValueType vt2, SDPatternOperator op>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
- "", []>, Sched<[]> {
+ "",
+ [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
@@ -1214,10 +1243,10 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
- def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
+ def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>;
+ def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>;
+ def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>;
}
//===----------------------------------------------------------------------===//
@@ -1489,7 +1518,7 @@ multiclass sve_fp_fcadd<string asm> {
class sve2_fp_convert_precision<bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
@@ -1504,6 +1533,8 @@ class sve2_fp_convert_precision<bits<4> opc, string asm,
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
multiclass sve2_fp_convert_down_narrow<string asm> {
@@ -1998,12 +2029,14 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
- let ElementSize = zprty1.ElementSize;
}
-multiclass sve_intx_dot<bit opc, string asm> {
+multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2028,22 +2061,27 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
- let ElementSize = ElementSizeNone;
}
-multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
- def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
+ SDPatternOperator op> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -2399,21 +2437,40 @@ multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
}
-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
- let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
- def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
- def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
- def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
- def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
- }
-}
-
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
}
+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b10010;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+ def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
+ def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
+ def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
+ def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
Operand immtype>
@@ -2451,9 +2508,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
// SVE2 Accumulate Group
//===----------------------------------------------------------------------===//
-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
- ZPRRegOp zprty, Operand immtype>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
@@ -2468,38 +2525,40 @@ class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
- ZPRRegOp zprty, Operand immtype>
+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
asm, "\t$Zda, $Zn, $imm",
"", []>, Sched<[]> {
@@ -2521,15 +2580,15 @@ class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
- def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
@@ -2607,9 +2666,9 @@ multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
// SVE2 Narrowing Group
//===----------------------------------------------------------------------===//
-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
- string asm, ZPRRegOp zprty1,
- ZPRRegOp zprty2, Operand immtype>
+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
@@ -2622,26 +2681,63 @@ class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
- let Inst{13-10} = opc;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
- vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
- vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
- vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
}
-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> imm;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-14} = 0b00;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
@@ -2652,19 +2748,46 @@ class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
- let Inst{12-10} = opc; // S, R, T
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b0; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
- def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b1; // Top
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
}
-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
@@ -2674,15 +2797,41 @@ class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
- let Inst{12-10} = opc;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
- def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
+ asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-13} = 0b000010;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
}
//===----------------------------------------------------------------------===//
@@ -2713,11 +2862,17 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
@@ -2735,11 +2890,21 @@ multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
-multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
@@ -3886,9 +4051,9 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
- RegisterOperand VecList>
-: I<(outs VecList:$Zt), iops,
+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
+ RegisterOperand listty, ZPRRegOp zprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, "\t$Zt, $Pg, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
@@ -3908,17 +4073,14 @@ class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
let mayStore = 1;
}
-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
@@ -4147,6 +4309,14 @@ class sve_int_perm_punpk<bit opc, string asm>
let Inst{3-0} = Pd;
}
+multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_perm_punpk<opc, asm>;
+
+ def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>;
+}
+
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
asm, "\t$Pd, $Pg/z",
@@ -5094,7 +5264,7 @@ multiclass sve_mem_p_fill<string asm> {
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
@@ -5119,17 +5289,15 @@ class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 7bb075c36e79..c27fc7a112ec 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -125,7 +125,7 @@ namespace llvm {
uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
// Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
- Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+ static const Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
std::string UpperName = Name.upper();
SmallVector<StringRef, 5> Ops;
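
The pattern above accepts generic system registers spelled S<op0>_<op1>_C<Cn>_C<Cm>_<op2>. A standalone check of one such spelling with std::regex, reusing the same pattern string (llvm::Regex is not used here; the register name is an arbitrary well-formed example):

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
      static const std::regex GenericRegPattern(
          "^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
      const std::string Name = "S3_0_C15_C7_0";
      std::smatch Ops;
      if (std::regex_match(Name, Ops, GenericRegPattern))
        std::printf("op0=%s op1=%s Cn=%s Cm=%s op2=%s\n",
                    Ops[1].str().c_str(), Ops[2].str().c_str(),
                    Ops[3].str().c_str(), Ops[4].str().c_str(),
                    Ops[5].str().c_str());
      return 0;
    }
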
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e5e2fc2cb0df..7a4fcac09ec4 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -313,9 +313,9 @@ struct SysAlias {
uint16_t Encoding;
FeatureBitset FeaturesRequired;
- SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
- SysAlias (const char *N, uint16_t E, FeatureBitset F) :
- Name(N), Encoding(E), FeaturesRequired(F) {};
+ constexpr SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
+ constexpr SysAlias(const char *N, uint16_t E, FeatureBitset F)
+ : Name(N), Encoding(E), FeaturesRequired(F) {}
bool haveFeatures(FeatureBitset ActiveFeatures) const {
return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
@@ -326,9 +326,10 @@ struct SysAlias {
struct SysAliasReg : SysAlias {
bool NeedsReg;
- SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
- SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F),
- NeedsReg(R) {};
+ constexpr SysAliasReg(const char *N, uint16_t E, bool R)
+ : SysAlias(N, E), NeedsReg(R) {}
+ constexpr SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F)
+ : SysAlias(N, E, F), NeedsReg(R) {}
};
namespace AArch64AT{
@@ -627,6 +628,18 @@ namespace AArch64II {
/// MO_S - Indicates that the bits of the symbol operand represented by
/// MO_G0 etc are signed.
MO_S = 0x100,
+
+ /// MO_PREL - Indicates that the bits of the symbol operand represented by
+ /// MO_G0 etc are PC relative.
+ MO_PREL = 0x200,
+
+ /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag
+ /// in bits 56-63.
+ /// On a FrameIndex operand, indicates that the underlying memory is tagged
+ /// with an unknown tag value (MTE); this needs to be lowered either to an
+ /// SP-relative load or store instruction (which do not check tags), or to
+ /// an LDG instruction to obtain the tag value.
+ MO_TAGGED = 0x400,
};
} // end namespace AArch64II
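
MO_PREL and MO_TAGGED join MO_S as independent flag bits that are OR'd onto a symbol operand's target flags. A minimal sketch of composing and testing them, with the flag values copied from the enum above:

    #include <cstdio>

    enum : unsigned {
      MO_S      = 0x100,  // the MO_G* field is signed
      MO_PREL   = 0x200,  // the MO_G* field is PC relative
      MO_TAGGED = 0x400,  // page/frame carries an MTE memory tag
    };

    int main() {
      unsigned TargetFlags = MO_PREL;
      std::printf("signed: %s, pc-relative: %s, tagged: %s\n",
                  (TargetFlags & MO_S) ? "yes" : "no",
                  (TargetFlags & MO_PREL) ? "yes" : "no",
                  (TargetFlags & MO_TAGGED) ? "yes" : "no");
      return 0;
    }
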