Diffstat (limited to 'lib/Target/AArch64')
-rw-r--r--  lib/Target/AArch64/AArch64.h | 3
-rw-r--r--  lib/Target/AArch64/AArch64.td | 187
-rw-r--r--  lib/Target/AArch64/AArch64A53Fix835769.cpp | 10
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 40
-rw-r--r--  lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 14
-rw-r--r--  lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 128
-rw-r--r--  lib/Target/AArch64/AArch64AsmPrinter.cpp | 67
-rw-r--r--  lib/Target/AArch64/AArch64BranchRelaxation.cpp | 30
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.cpp | 104
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.h | 36
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.td | 18
-rw-r--r--  lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 25
-rw-r--r--  lib/Target/AArch64/AArch64CollectLOH.cpp | 23
-rw-r--r--  lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 26
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 24
-rw-r--r--  lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 18
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 215
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 81
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp | 895
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.h | 15
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 2400
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 963
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h | 60
-rw-r--r--  lib/Target/AArch64/AArch64InstrAtomics.td | 51
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td | 25
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 1743
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 107
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 193
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 1050
-rw-r--r--  lib/Target/AArch64/AArch64MachineFunctionInfo.h | 38
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64PromoteConstant.cpp | 327
-rw-r--r--  lib/Target/AArch64/AArch64RedundantCopyElimination.cpp | 182
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 168
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.h | 69
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 26
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.td | 2
-rw-r--r--  lib/Target/AArch64/AArch64SchedA53.td | 4
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57.td | 3
-rw-r--r--  lib/Target/AArch64/AArch64SchedCyclone.td | 14
-rw-r--r--  lib/Target/AArch64/AArch64SchedKryo.td | 133
-rw-r--r--  lib/Target/AArch64/AArch64SchedKryoDetails.td | 2358
-rw-r--r--  lib/Target/AArch64/AArch64SchedM1.td | 29
-rw-r--r--  lib/Target/AArch64/AArch64SchedVulcan.td | 855
-rw-r--r--  lib/Target/AArch64/AArch64Schedule.td | 8
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 10
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.h | 14
-rw-r--r--  lib/Target/AArch64/AArch64StorePairSuppress.cpp | 9
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 109
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h | 122
-rw-r--r--  lib/Target/AArch64/AArch64SystemOperands.td | 1018
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 138
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.h | 12
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 77
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.h | 11
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 214
-rw-r--r--  lib/Target/AArch64/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/AArch64/CMakeLists.txt | 19
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 15
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.h | 2
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp | 8
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h | 2
-rw-r--r--  lib/Target/AArch64/Disassembler/Makefile | 16
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 149
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 4
-rw-r--r--  lib/Target/AArch64/InstPrinter/Makefile | 15
-rw-r--r--  lib/Target/AArch64/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h | 43
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 230
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 63
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 53
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 22
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 1
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/Makefile | 16
-rw-r--r--  lib/Target/AArch64/Makefile | 25
-rw-r--r--  lib/Target/AArch64/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 943
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h | 1003
-rw-r--r--  lib/Target/AArch64/Utils/Makefile | 16
79 files changed, 11776 insertions, 5404 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 21106c9ad29a6..c767c75fce573 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -27,6 +27,7 @@ class FunctionPass;
class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64RedundantCopyEliminationPass();
FunctionPass *createAArch64ConditionalCompares();
FunctionPass *createAArch64AdvSIMDScalar();
FunctionPass *createAArch64BranchRelaxation();
@@ -44,6 +45,8 @@ FunctionPass *createAArch64A53Fix835769();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+
+void initializeAArch64ExpandPseudoPass(PassRegistry&);
} // end namespace llvm
#endif
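
The two new declarations follow LLVM's usual pass plumbing: a create* factory that the target machine calls when assembling its pipeline, and an initialize* hook that registers the pass with the PassRegistry. A minimal sketch of the consuming side, assuming the conventional call sites (the exact hooks and any option gating live in AArch64TargetMachine.cpp):

  // Sketch only: typical call sites for the hooks declared above.
  void AArch64PassConfig::addPostRegAlloc() {
    // Clean up copies made redundant by register allocation.
    addPass(createAArch64RedundantCopyEliminationPass());
  }

  extern "C" void LLVMInitializeAArch64Target() {
    // ...target registration elided...
    initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
  }
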
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index cd3e84d38fe2f..b1e881685b0c6 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
+// Target-independent interfaces which we are implementing.
//===----------------------------------------------------------------------===//
include "llvm/Target/Target.td"
@@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
+
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -58,6 +61,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
"Reserve X18, making it unavailable "
"as a GPR">;
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+ "MergeNarrowLoads", "true",
+ "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+ "true",
+ "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+ "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+ "CustomAsCheapAsMove", "true",
+ "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+ "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+ "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+ "AvoidQuadLdStPairs", "true",
+ "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+ "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+ "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+ "macroop-fusion", "HasMacroOpFusion", "true",
+ "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+ "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+ "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+ "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -66,7 +113,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions", [FeatureCRC]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
- "Support ARM v8.2a instructions", [HasV8_1aOps]>;
+ "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -85,67 +132,145 @@ include "AArch64InstrInfo.td"
def AArch64InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
+// Named operands for MRS/MSR/TLBI/...
+//===----------------------------------------------------------------------===//
+
+include "AArch64SystemOperands.td"
+
+//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedM1.td"
+include "AArch64SchedKryo.td"
+include "AArch64SchedVulcan.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
- "Cortex-A35 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A35 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A53 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseAA
+ ]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureFPARMv8,
+ "Cortex-A57 ARM processors", [
+ FeatureBalanceFPOps,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+ "Cortex-A72 ARM processors", [
+ FeatureCRC,
FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
- "Cyclone",
- [FeatureFPARMv8,
- FeatureNEON,
+ "Cyclone", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
FeatureCrypto,
- FeatureCRC,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
FeaturePerfMon,
- FeatureZCRegMove, FeatureZCZeroing]>;
+ FeatureSlowMisaligned128Store,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;
def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M1 processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Samsung Exynos-M1 processors", [
+ FeatureAvoidQuadLdStPairs,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseRSqrt
+ ]>;
+
+def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+ "Qualcomm Kryo processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing
+ ]>;
+
+def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
+ "Broadcom Vulcan processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ HasV8_1aOps]>;
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
- FeatureNEON,
- FeatureCRC,
- FeaturePerfMon]>;
+def : ProcessorModel<"generic", NoSchedModel, [
+ FeatureCRC,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler
+ ]>;
// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
-def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
+// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as a Cortex-A57.
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
+def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
+def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
+def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
//===----------------------------------------------------------------------===//
// Assembly parser
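
Each SubtargetFeature above names the AArch64Subtarget member it sets (the second template argument); the generated ParseSubtargetFeatures() flips that member when the feature is present, and C++ code reads it back through a hand-written accessor. A minimal sketch of the pairing for FeatureBalanceFPOps, with the accessor name taken from its use later in this patch (class body abbreviated):

  // In AArch64Subtarget.h (sketch): the field named by the .td entry.
  class AArch64Subtarget : public AArch64GenSubtargetInfo {
    bool BalanceFPOps = false; // set by ParseSubtargetFeatures()
  public:
    bool balanceFPOps() const { return BalanceFPOps; }
  };

  // Consumer, as in AArch64A57FPLoadBalancing.cpp below:
  //   if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
  //     return false;
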
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index d215d9e831c06..c2cca63f49774 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -87,6 +86,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Workaround A53 erratum 835769 pass";
}
@@ -133,8 +137,8 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
MachineBasicBlock *PrevBB = &*std::prev(MBBI);
for (MachineBasicBlock *S : MBB->predecessors())
- if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
- !TBB && !FBB)
+ if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB &&
+ !FBB)
return S;
return nullptr;
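
The renamed analyzeBranch keeps the old AnalyzeBranch contract: it returns false when the block's terminators are analyzable and fills TBB/FBB with the taken and fall-through targets. Success with both pointers still null, as tested above, therefore means the block ends in no branch at all and can only fall through. The idiom, restated as a stand-alone sketch (the helper name is illustrative, not from the patch):

  static bool fallsThrough(const TargetInstrInfo *TII,
                           MachineBasicBlock &Pred) {
    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
    SmallVector<MachineOperand, 4> Cond;
    // Analyzable, and neither a taken nor a fall-through branch found.
    return !TII->analyzeBranch(Pred, TBB, FBB, Cond) && !TBB && !FBB;
  }
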
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 3d1ab4e3fc2b6..0465e59dc54a6 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -43,7 +43,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <list>
using namespace llvm;
#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
@@ -125,6 +124,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "A57 FP Anti-dependency breaker";
}
@@ -222,7 +226,7 @@ public:
}
/// Return true if MI is a member of the chain.
- bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }
+ bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; }
/// Return the number of instructions in the chain.
unsigned size() const {
@@ -248,9 +252,10 @@ public:
MachineInstr *getKill() const { return KillInst; }
/// Return an instruction that can be used as an iterator for the end
/// of the chain. This is the maximum of KillInst (if set) and LastInst.
- MachineBasicBlock::iterator getEnd() const {
+ MachineBasicBlock::iterator end() const {
return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
}
+ MachineBasicBlock::iterator begin() const { return getStart(); }
/// Can the Kill instruction (assuming one exists) be modified?
bool isKillImmutable() const { return KillIsImmutable; }
@@ -307,9 +312,10 @@ public:
//===----------------------------------------------------------------------===//
bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
- // Don't do anything if this isn't an A53 or A57.
- if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
- F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+ if (skipFunction(*F.getFunction()))
+ return false;
+
+ if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
return false;
bool Changed = false;
@@ -492,15 +498,14 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
MachineBasicBlock &MBB) {
RegScavenger RS;
- RS.enterBasicBlock(&MBB);
+ RS.enterBasicBlock(MBB);
RS.forward(MachineBasicBlock::iterator(G->getStart()));
// Can we find an appropriate register that is available throughout the life
// of the chain?
unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
- for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
- I != E; ++I) {
+ for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
RS.forward(I);
AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
@@ -530,8 +535,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
for (auto Reg : Ord) {
if (!AvailableRegs[Reg])
continue;
- if ((C == Color::Even && (Reg % 2) == 0) ||
- (C == Color::Odd && (Reg % 2) == 1))
+ if (C == getColor(Reg))
return Reg;
}
@@ -554,16 +558,14 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
std::map<unsigned, unsigned> Substs;
- for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
- I != E; ++I) {
- if (!G->contains(I) &&
- (&*I != G->getKill() || G->isKillImmutable()))
+ for (MachineInstr &I : *G) {
+ if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable()))
continue;
// I is a member of G, or I is a mutable instruction that kills G.
std::vector<unsigned> ToErase;
- for (auto &U : I->operands()) {
+ for (auto &U : I.operands()) {
if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
unsigned OrigReg = U.getReg();
U.setReg(Substs[OrigReg]);
@@ -583,11 +585,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
Substs.erase(J);
// Only change the def if this isn't the last instruction.
- if (&*I != G->getKill()) {
- MachineOperand &MO = I->getOperand(0);
+ if (&I != G->getKill()) {
+ MachineOperand &MO = I.getOperand(0);
bool Change = TransformAll || getColor(MO.getReg()) != C;
- if (G->requiresFixup() && &*I == G->getLast())
+ if (G->requiresFixup() && &I == G->getLast())
Change = false;
if (Change) {
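
The skipFunction guard added above reappears in several passes throughout this patch. It is the standard gate that honors the optnone attribute and the opt-bisect machinery, letting optional machine passes stand down per function. The idiom, sketched with a hypothetical pass name:

  bool SomeAArch64Pass::runOnMachineFunction(MachineFunction &MF) {
    // Respect optnone and -opt-bisect-limit: do nothing for this function.
    if (skipFunction(*MF.getFunction()))
      return false;
    // ...actual transformation...
    return true;
  }
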
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 3afcdfb8b930d..4846ef08c983c 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -20,10 +20,9 @@
// e = getelementptr ..., i64 a
//
// This is legal to do if the computations are marked with either nsw or nuw
-// markers.
-// Moreover, the current heuristic is simple: it does not create new sext
-// operations, i.e., it gives up when a sext would have forked (e.g., if
-// a = add i32 b, c, two sexts are required to promote the computation).
+// markers. Moreover, the current heuristic is simple: it does not create new
+// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
+// = add i32 b, c, two sexts are required to promote the computation).
//
// FIXME: This pass may be useful for other targets too.
// ===---------------------------------------------------------------------===//
@@ -207,9 +206,7 @@ bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
}
static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
- if (isa<SelectInst>(Inst) && OpIdx == 0)
- return false;
- return true;
+ return !(isa<SelectInst>(Inst) && OpIdx == 0);
}
bool
@@ -481,6 +478,9 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
}
bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
if (!EnableAddressTypePromotion || F.isDeclaration())
return false;
Func = &F;
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 1644d71d2821d..d0a2dd3fa1fc0 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -76,12 +76,12 @@ private:
// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
- bool isProfitableToTransform(const MachineInstr *MI) const;
+ bool isProfitableToTransform(const MachineInstr &MI) const;
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
- void transformInstruction(MachineInstr *MI);
+ void transformInstruction(MachineInstr &MI);
// processMachineBasicBlock - Main optimization loop.
bool processMachineBasicBlock(MachineBasicBlock *MBB);
@@ -132,19 +132,19 @@ static bool isFPR64(unsigned Reg, unsigned SubReg,
// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return zero_reg if the instruction is not a copy.
-static unsigned getSrcFromCopy(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- unsigned &SubReg) {
+static MachineOperand *getSrcFromCopy(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &SubReg) {
SubReg = 0;
// The "FMOV Xd, Dn" instruction is the typical form.
if (MI->getOpcode() == AArch64::FMOVDXr ||
MI->getOpcode() == AArch64::FMOVXDr)
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
// A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
// these at this stage, but it's easy to check for.
if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
SubReg = AArch64::dsub;
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
}
// Or just a plain COPY instruction. This can be directly to/from FPR64,
// or it can be a dsub subreg reference to an FPR128.
@@ -152,18 +152,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI,
if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
MRI) &&
isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
MRI) &&
isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
MRI)) {
SubReg = MI->getOperand(1).getSubReg();
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
}
}
// Otherwise, this is some other kind of instruction.
- return 0;
+ return nullptr;
}
// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
@@ -189,16 +189,16 @@ static unsigned getTransformOpcode(unsigned Opc) {
return Opc;
}
-static bool isTransformable(const MachineInstr *MI) {
- unsigned Opc = MI->getOpcode();
+static bool isTransformable(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
return Opc != getTransformOpcode(Opc);
}
// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
-bool
-AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+bool AArch64AdvSIMDScalar::isProfitableToTransform(
+ const MachineInstr &MI) const {
// If this instruction isn't eligible to be transformed (no SIMD equivalent),
// early exit since that's the common case.
if (!isTransformable(MI))
@@ -209,33 +209,33 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
unsigned NumNewCopies = 3;
unsigned NumRemovableCopies = 0;
- unsigned OrigSrc0 = MI->getOperand(1).getReg();
- unsigned OrigSrc1 = MI->getOperand(2).getReg();
- unsigned Src0 = 0, SubReg0;
- unsigned Src1 = 0, SubReg1;
+ unsigned OrigSrc0 = MI.getOperand(1).getReg();
+ unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ unsigned SubReg0;
+ unsigned SubReg1;
if (!MRI->def_empty(OrigSrc0)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc0);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
// If the source was from a copy, we don't need to insert a new copy.
- if (Src0)
+ if (MOSrc0)
--NumNewCopies;
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+ if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
++NumRemovableCopies;
}
if (!MRI->def_empty(OrigSrc1)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc1);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
- if (Src1)
+ MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ if (MOSrc1)
--NumNewCopies;
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+ if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
++NumRemovableCopies;
}
@@ -244,14 +244,14 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
// any of the uses is a transformable instruction, it's likely the transforms
// will chain, enabling us to save a copy there, too. This is an aggressive
// heuristic that approximates the graph based cost analysis described above.
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
bool AllUsesAreCopies = true;
for (MachineRegisterInfo::use_instr_nodbg_iterator
Use = MRI->use_instr_nodbg_begin(Dst),
E = MRI->use_instr_nodbg_end();
Use != E; ++Use) {
unsigned SubReg;
- if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+ if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
++NumRemovableCopies;
// If the use is an INSERT_SUBREG, that's still something that can
// directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
@@ -279,12 +279,11 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
return TransformAll;
}
-static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
+static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
unsigned Dst, unsigned Src, bool IsKill) {
- MachineInstrBuilder MIB =
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
- Dst)
- .addReg(Src, getKillRegState(IsKill));
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AArch64::COPY), Dst)
+ .addReg(Src, getKillRegState(IsKill));
DEBUG(dbgs() << " adding copy: " << *MIB);
++NumCopiesInserted;
return MIB;
@@ -293,43 +292,56 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
-void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
- DEBUG(dbgs() << "Scalar transform: " << *MI);
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
+ DEBUG(dbgs() << "Scalar transform: " << MI);
- MachineBasicBlock *MBB = MI->getParent();
- unsigned OldOpc = MI->getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+ unsigned OldOpc = MI.getOpcode();
unsigned NewOpc = getTransformOpcode(OldOpc);
assert(OldOpc != NewOpc && "transform an instruction to itself?!");
// Check if we need a copy for the source registers.
- unsigned OrigSrc0 = MI->getOperand(1).getReg();
- unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned OrigSrc0 = MI.getOperand(1).getReg();
+ unsigned OrigSrc1 = MI.getOperand(2).getReg();
unsigned Src0 = 0, SubReg0;
unsigned Src1 = 0, SubReg1;
+ bool KillSrc0 = false, KillSrc1 = false;
if (!MRI->def_empty(OrigSrc0)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc0);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
- assert(Src0 && "Can't delete copy w/o a valid original source!");
- Def->eraseFromParent();
- ++NumCopiesDeleted;
+ if (MOSrc0) {
+ Src0 = MOSrc0->getReg();
+ KillSrc0 = MOSrc0->isKill();
+ // Src0 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc0->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc0)) {
+ assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
}
}
if (!MRI->def_empty(OrigSrc1)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc1);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
- assert(Src1 && "Can't delete copy w/o a valid original source!");
- Def->eraseFromParent();
- ++NumCopiesDeleted;
+ if (MOSrc1) {
+ Src1 = MOSrc1->getReg();
+ KillSrc1 = MOSrc1->isKill();
+ // Src1 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc1->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc1)) {
+ assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
}
}
// If we weren't able to reference the original source directly, create a
@@ -337,12 +349,14 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
if (!Src0) {
SubReg0 = 0;
Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
- insertCopy(TII, MI, Src0, OrigSrc0, true);
+ insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
+ KillSrc0 = true;
}
if (!Src1) {
SubReg1 = 0;
Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
- insertCopy(TII, MI, Src1, OrigSrc1, true);
+ insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
+ KillSrc1 = true;
}
// Create a vreg for the destination.
@@ -353,17 +367,17 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
// For now, all of the new instructions have the same simple three-register
// form, so no need to special case based on what instruction we're
// building.
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
- .addReg(Src0, getKillRegState(true), SubReg0)
- .addReg(Src1, getKillRegState(true), SubReg1);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
+ .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
+ .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
// Now copy the result back out to a GPR.
// FIXME: Try to avoid this if all uses could actually just use the FPR64
// directly.
- insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+ insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
// Erase the old instruction.
- MI->eraseFromParent();
+ MI.eraseFromParent();
++NumScalarInsnsUsed;
}
@@ -372,8 +386,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
bool Changed = false;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
- MachineInstr *MI = I;
- ++I;
+ MachineInstr &MI = *I++;
if (isProfitableToTransform(MI)) {
transformInstruction(MI);
Changed = true;
@@ -387,6 +400,9 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
bool Changed = false;
DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+ if (skipFunction(*mf.getFunction()))
+ return false;
+
MRI = &mf.getRegInfo();
TII = mf.getSubtarget().getInstrInfo();
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index ada995bad37e6..22374f754603d 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -49,6 +49,7 @@ namespace {
class AArch64AsmPrinter : public AsmPrinter {
AArch64MCInstLower MCInstLowering;
StackMaps SM;
+ const AArch64Subtarget *STI;
public:
AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -83,11 +84,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override {
AArch64FI = F.getInfo<AArch64FunctionInfo>();
+ STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
return AsmPrinter::runOnMachineFunction(F);
}
private:
- MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
bool printAsmRegInClass(const MachineOperand &MO,
@@ -112,6 +113,9 @@ private:
/// \brief Emit the LOHs contained in AArch64FI.
void EmitLOHs();
+ /// Emit instruction to set float register to zero.
+ void EmitFMov0(const MachineInstr &MI);
+
typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
MInstToMCSymbol LOHInstToLabel;
};
@@ -133,19 +137,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
}
-MachineLocation
-AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
- MachineLocation Location;
- assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
- // Frame address. Currently handles register +- offset only.
- if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
- Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
- else {
- DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
- }
- return Location;
-}
-
void AArch64AsmPrinter::EmitLOHs() {
SmallVector<MCSymbol *, 3> MCArgs;
@@ -238,8 +229,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
bool isVector, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
- const AArch64RegisterInfo *RI =
- MF->getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const TargetRegisterInfo *RI = STI->getRegisterInfo();
unsigned Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
@@ -404,16 +394,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 16;
// Materialize the jump address:
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF)
.addImm(32));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF)
.addImm(16));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF)
@@ -430,6 +420,40 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
+void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ if (STI->hasZeroCycleZeroing()) {
+ // Convert S/D register to corresponding Q register
+ if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) {
+ DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
+ } else {
+ assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
+ }
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
+ }
+ EmitToStreamer(*OutStreamer, FMov);
+ }
+}
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
@@ -535,6 +559,11 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ EmitFMov0(*MI);
+ return;
+
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*OutStreamer, SM, *MI);
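
EmitFMov0 leans on the generated register enums for S0-S31, D0-D31 and Q0-Q31 each being contiguous, so mapping an S or D register to the Q register that covers it is plain enum arithmetic. A sketch of the conversion and of the two lowerings chosen between (the helper name and the emitted assembly lines are illustrative):

  static unsigned coveringQReg(unsigned Reg) {
    // Mirrors the arithmetic in EmitFMov0 above.
    if (AArch64::S0 <= Reg && Reg <= AArch64::S31)
      return AArch64::Q0 + (Reg - AArch64::S0);
    assert(AArch64::D0 <= Reg && Reg <= AArch64::D31);
    return AArch64::Q0 + (Reg - AArch64::D0);
  }
  // For an FMOVD0 of d3 this yields, roughly:
  //   with FeatureZCZeroing:  movi v3.2d, #0   // zeroes all of q3
  //   otherwise:              fmov d3, xzr     // move from the zero register
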
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index a614f555a4e9f..9ec6ae4118a44 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -177,7 +177,7 @@ void AArch64BranchRelaxation::scanFunction() {
void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
unsigned Size = 0;
for (const MachineInstr &MI : MBB)
- Size += TII->GetInstSizeInBytes(&MI);
+ Size += TII->GetInstSizeInBytes(MI);
BlockInfo[MBB.getNumber()].Size = Size;
}
@@ -195,7 +195,7 @@ unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->GetInstSizeInBytes(I);
+ Offset += TII->GetInstSizeInBytes(*I);
}
return Offset;
}
@@ -415,12 +415,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
// Analyze the branch so we know how to update the successor lists.
MachineBasicBlock *TBB, *FBB;
SmallVector<MachineOperand, 2> Cond;
- TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+ TII->analyzeBranch(*MBB, TBB, FBB, Cond, false);
MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
- int delta = TII->GetInstSizeInBytes(&MBB->back());
+ int delta = TII->GetInstSizeInBytes(MBB->back());
BlockInfo[MBB->getNumber()].Size -= delta;
MBB->back().eraseFromParent();
// BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
@@ -446,12 +446,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
if (MI->getOpcode() == AArch64::Bcc)
invertBccCondition(MIB);
MIB.addMBB(NextBB);
- BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
- BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
// Remove the old conditional branch. It may or may not still be in MBB.
- BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
// Finally, keep the block offsets up to date.
@@ -463,12 +463,13 @@ bool AArch64BranchRelaxation::relaxBranchInstructions() {
bool Changed = false;
// Relaxing branches involves creating new basic blocks, so re-eval
// end() for termination.
- for (auto &MBB : *MF) {
- MachineInstr *MI = MBB.getFirstTerminator();
- if (isConditionalBranch(MI->getOpcode()) &&
- !isBlockInRange(MI, getDestBlock(MI),
- getBranchDisplacementBits(MI->getOpcode()))) {
- fixupConditionalBranch(MI);
+ for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
+ MachineBasicBlock &MBB = *I;
+ MachineInstr &MI = *MBB.getFirstTerminator();
+ if (isConditionalBranch(MI.getOpcode()) &&
+ !isBlockInRange(&MI, getDestBlock(&MI),
+ getBranchDisplacementBits(MI.getOpcode()))) {
+ fixupConditionalBranch(&MI);
++NumRelaxed;
Changed = true;
}
@@ -513,8 +514,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
return MadeChange;
}
-/// createAArch64BranchRelaxation - returns an instance of the constpool
-/// island pass.
+/// Returns an instance of the AArch64 Branch Relaxation pass.
FunctionPass *llvm::createAArch64BranchRelaxation() {
return new AArch64BranchRelaxation();
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
new file mode 100644
index 0000000000000..e3522e63c21c0
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -0,0 +1,104 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64CallLowering.h"
+#include "AArch64ISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
+ : CallLowering(&TLI) {
+}
+
+bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ MachineInstr *Return = MIRBuilder.buildInstr(AArch64::RET_ReallyLR);
+ assert(Return && "Unable to build a return instruction?!");
+
+ assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+ if (VReg) {
+ assert(Val->getType()->isIntegerTy() && "Type not supported yet");
+ unsigned Size = Val->getType()->getPrimitiveSizeInBits();
+ assert((Size == 64 || Size == 32) && "Size not supported yet");
+ unsigned ResReg = (Size == 32) ? AArch64::W0 : AArch64::X0;
+ // Set the insertion point to be right before Return.
+ MIRBuilder.setInstr(*Return, /* Before */ true);
+ MachineInstr *Copy =
+ MIRBuilder.buildInstr(TargetOpcode::COPY, ResReg, VReg);
+ (void)Copy;
+ assert(Copy->getNextNode() == Return &&
+ "The insertion did not happen where we expected");
+ MachineInstrBuilder(MIRBuilder.getMF(), Return)
+ .addReg(ResReg, RegState::Implicit);
+ }
+ return true;
+}
+
+bool AArch64CallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = *MF.getFunction();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ unsigned NumArgs = Args.size();
+ Function::const_arg_iterator CurOrigArg = Args.begin();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
+ MVT ValVT = MVT::getVT(CurOrigArg->getType());
+ CCAssignFn *AssignFn =
+ TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, ISD::ArgFlagsTy(), CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ assert(ArgLocs.size() == Args.size() &&
+ "We have a different number of location and args?!");
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ assert(VA.isRegLoc() && "Not yet implemented");
+ // Transform the arguments in physical registers into virtual ones.
+ MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
+ MIRBuilder.buildInstr(TargetOpcode::COPY, VRegs[i], VA.getLocReg());
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ // We don't care about bitcast.
+ break;
+ case CCValAssign::AExt:
+ case CCValAssign::SExt:
+ case CCValAssign::ZExt:
+ // Zero/Sign extend the register.
+ assert(0 && "Not yet implemented");
+ break;
+ }
+ }
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
new file mode 100644
index 0000000000000..411622803461f
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -0,0 +1,36 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AArch64TargetLowering;
+
+class AArch64CallLowering: public CallLowering {
+ public:
+ AArch64CallLowering(const AArch64TargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg) const override;
+ bool
+ lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const override;
+};
+} // End of namespace llvm
+#endif
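
GlobalISel reaches this object through the subtarget, which is expected to build one AArch64CallLowering on top of its AArch64TargetLowering and hand it out via TargetSubtargetInfo::getCallLowering(). A sketch of that accessor side, assuming the conventional ownership pattern (the member name is illustrative; the real wiring belongs in AArch64Subtarget):

  class AArch64Subtarget : public AArch64GenSubtargetInfo {
    AArch64TargetLowering TLInfo;
    std::unique_ptr<CallLowering> CallLoweringInfo; // illustrative member
  public:
    const CallLowering *getCallLowering() const override {
      return CallLoweringInfo.get();
    }
  };
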
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 388d64ec4e99d..178e3971640ed 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -45,6 +45,9 @@ def CC_AArch64_AAPCS : CallingConv<[
// supported there.
CCIfNest<CCAssignToReg<[X18]>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -86,6 +89,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
@@ -126,6 +131,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+ // A SwiftError is passed in X19.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -270,6 +281,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
// case)
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+def CSR_AArch64_AAPCS_SwiftError
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+
// The function used by Darwin to obtain the address of a thread-local variable
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
// fast path for calculation, but other registers except X0 (argument/return)
@@ -310,3 +324,7 @@ def CSR_AArch64_AllRegs
(sequence "Q%u", 0, 31))>;
def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sequence "X%u", 9, 15))>;
+
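
CSR_AArch64_AAPCS_SwiftError subtracts X19 from the normal AAPCS save list, so a swifterror value living in X19 can cross calls without being spilled and reloaded as a callee-save. A sketch of how the register-info side would select this list (the predicate spelling is illustrative; the authoritative check belongs in AArch64RegisterInfo.cpp):

  const MCPhysReg *
  AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
    // Functions with a swifterror parameter must not treat X19 as saved.
    if (MF->getFunction()->getAttributes().hasAttrSomewhere(
            Attribute::SwiftError))
      return CSR_AArch64_AAPCS_SwiftError_SaveList;
    return CSR_AArch64_AAPCS_SaveList;
  }
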
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 9310ac4a44a2d..011a03622ba51 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -39,6 +39,9 @@ struct LDTLSCleanup : public MachineFunctionPass {
LDTLSCleanup() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
// No point folding accesses if there isn't at least two.
@@ -69,9 +72,9 @@ struct LDTLSCleanup : public MachineFunctionPass {
break;
if (TLSBaseAddrReg)
- I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
else
- I = setRegister(I, &TLSBaseAddrReg);
+ I = setRegister(*I, &TLSBaseAddrReg);
Changed = true;
break;
default:
@@ -89,27 +92,27 @@ struct LDTLSCleanup : public MachineFunctionPass {
// Replace the TLS_base_addr instruction I with a copy from
// TLSBaseAddrReg, returning the new instruction.
- MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+ MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I,
unsigned TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineFunction *MF = I.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
// code sequence assumes the address will be.
- MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- AArch64::X0).addReg(TLSBaseAddrReg);
+ MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), AArch64::X0)
+ .addReg(TLSBaseAddrReg);
// Erase the TLS_base_addr instruction.
- I->eraseFromParent();
+ I.eraseFromParent();
return Copy;
}
// Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
- MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
// Create a virtual register for the TLS base address.
@@ -118,7 +121,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
// Insert a copy from X0 to TLSBaseAddrReg for later.
MachineInstr *Copy =
- BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(),
+ BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(),
TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
.addReg(AArch64::X0);
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 78c239b11ef31..5eecb3a868566 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -179,6 +179,11 @@ struct AArch64CollectLOH : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return AARCH64_COLLECT_LOH_NAME;
}
@@ -623,10 +628,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs,
continue;
}
DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
- SmallVector<const MachineInstr *, 2> Args;
- Args.push_back(L2);
- Args.push_back(L1);
- AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+ AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1});
++NumADRPSimpleCandidate;
}
#ifdef DEBUG
@@ -760,13 +762,9 @@ static bool registerADRCandidate(const MachineInstr &Use,
"ADD already involved in LOH.");
DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
- SmallVector<const MachineInstr *, 2> Args;
- Args.push_back(&Def);
- Args.push_back(&Use);
-
- AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
- : MCLOH_AdrpLdrGot,
- Args);
+ AArch64FI.addLOHDirective(
+ Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot,
+ {&Def, &Use});
return true;
}
@@ -1036,6 +1034,9 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
}
bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index fc27bfee73d13..8fff381d391e9 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -70,7 +70,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -144,10 +143,18 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
if (I->getOpcode() != AArch64::Bcc)
return nullptr;
+ // Since we may modify cmp of this MBB, make sure NZCV does not live out.
+ for (auto SuccBB : MBB->successors())
+ if (SuccBB->isLiveIn(AArch64::NZCV))
+ return nullptr;
+
// Now find the instruction controlling the terminator.
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
--I;
assert(!I->isTerminator() && "Spurious terminator");
+ // Check if there is any use of NZCV between CMP and Bcc.
+ if (I->readsRegister(AArch64::NZCV))
+ return nullptr;
switch (I->getOpcode()) {
// cmp is an alias for subs with a dead destination register.
case AArch64::SUBSWri:
@@ -166,7 +173,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
return nullptr;
}
- return I;
+ return &*I;
}
// Prevent false positive case like:
// cmp w19, #0
@@ -268,13 +275,13 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// The fact that this comparison was picked ensures that it's related to the
// first terminator instruction.
- MachineInstr *BrMI = MBB->getFirstTerminator();
+ MachineInstr &BrMI = *MBB->getFirstTerminator();
// Change condition in branch instruction.
- BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(Cmp)
- .addOperand(BrMI->getOperand(1));
- BrMI->eraseFromParent();
+ .addOperand(BrMI.getOperand(1));
+ BrMI.eraseFromParent();
MBB->updateTerminator();
@@ -311,6 +318,9 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget().getInstrInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
MRI = &MF.getRegInfo();
@@ -327,7 +337,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineOperand, 4> HeadCond;
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+ if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) {
continue;
}
@@ -338,7 +348,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineOperand, 4> TrueCond;
MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
- if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+ if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
continue;
}
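The two NZCV checks added in findSuitableCompare guard against rewriting flags that someone else still reads. A hand-written illustration of the hazard (registers, immediate, and consumer are hypothetical): this pass may turn cmp w0, #42 / b.gt into cmp w0, #43 / b.ge, which is only sound if nothing between the cmp and the branch, and nothing in a successor block, observes NZCV:

//   cmp  w0, #42
//   csel w1, w2, w3, gt    ; reads NZCV: adjusting the cmp would change this
//   b.gt .Ltaken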
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index df1320fbd4c95..e1b0dc724b39a 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -18,13 +18,10 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -307,7 +304,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::CBNZW:
case AArch64::CBNZX:
// These can be converted into a ccmp against #0.
- return I;
+ return &*I;
}
++NumCmpTermRejs;
DEBUG(dbgs() << "Flags not used by terminator: " << *I);
@@ -338,7 +335,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::ADDSWrr:
case AArch64::ADDSXrr:
if (isDeadDef(I->getOperand(0).getReg()))
- return I;
+ return &*I;
DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
++NumLiveDstRejs;
return nullptr;
@@ -346,12 +343,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::FCMPDrr:
case AArch64::FCMPESrr:
case AArch64::FCMPEDrr:
- return I;
+ return &*I;
}
// Check for flag reads and clobbers.
MIOperands::PhysRegInfo PRI =
- MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
+ MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI);
if (PRI.Read) {
// The ccmp doesn't produce exactly the same flags as the original
@@ -496,7 +493,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// The branch we're looking to eliminate must be analyzable.
HeadCond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+ if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
DEBUG(dbgs() << "Head branch not analyzable.\n");
++NumHeadBranchRejs;
return false;
@@ -524,7 +521,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
CmpBBCond.clear();
TBB = FBB = nullptr;
- if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+ if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
++NumCmpBranchRejs;
return false;
@@ -759,7 +756,6 @@ void initializeAArch64ConditionalComparesPass(PassRegistry &);
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
@@ -770,7 +766,6 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
}
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -849,9 +844,9 @@ bool AArch64ConditionalCompares::shouldConvert() {
// Instruction depths can be computed for all trace instructions above CmpBB.
unsigned HeadDepth =
- Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
+ Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
unsigned CmpBBDepth =
- Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
+ Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
DEBUG(dbgs() << "Head depth: " << HeadDepth
<< "\nCmpBB depth: " << CmpBBDepth << '\n');
if (CmpBBDepth > HeadDepth + DelayLimit) {
@@ -891,6 +886,9 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
SchedModel = MF.getSubtarget().getSchedModel();
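For reference, the conversion this pass performs, which the trace-depth heuristic above gates: the compare in CmpBB is rewritten as a conditional compare in Head, removing one branch. A rough sketch on hypothetical registers (the #0 flags immediate is chosen so the final branch falls through when the first condition fails):

//   Before:                 After:
//     cmp  w0, #0             cmp  w0, #0
//     b.ne .Lexit             ccmp w1, #5, #0, eq   ; compare only if eq held
//     cmp  w1, #5             b.lt .Lok
//     b.lt .Lok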
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 576cf4a741678..7a6f7669db5f3 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -48,6 +48,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -88,6 +93,12 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
DEBUG(dbgs() << " Ignoring, operand is frame index\n");
continue;
}
+ if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
+ // It is not allowed to write to the same register (not even the zero
+ // register) twice in a single instruction.
+ DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
+ continue;
+ }
for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDead() && MO.isDef()) {
@@ -100,7 +111,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
continue;
}
// Don't change the register if there's an implicit def of a subreg or
- // supperreg.
+ // superreg.
if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) {
DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n");
continue;
@@ -123,6 +134,8 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
MO.setReg(NewReg);
DEBUG(MI.print(dbgs()));
++NumDeadDefsReplaced;
+ // Only replace one dead register; see the zero-register check above.
+ break;
}
}
}
@@ -136,6 +149,9 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
for (auto &MBB : MF)
if (processMachineBasicBlock(MBB))
Changed = true;
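The two additions above (the XZR/WZR pre-check and the break after one replacement) exist because writing the zero register twice in a single instruction is still architecturally invalid. A minimal illustration with hypothetical registers:

//   ldp x8, x9, [sp]          ; suppose both x8 and x9 are dead
//   naive rewrite:   ldp xzr, xzr, [sp]   ; Rt == Rt2: constrained-unpredictable
//   with this pass:  ldp xzr, x9, [sp]    ; only the first dead def is replaced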
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index d24e42a937634..5e477d39e074a 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/MathExtras.h"
@@ -46,9 +47,18 @@ public:
private:
bool expandMBB(MachineBasicBlock &MBB);
- bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+
+ bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
+ unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
char AArch64ExpandPseudo::ID = 0;
}
@@ -403,9 +413,17 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned BitSize) {
MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
uint64_t Imm = MI.getOperand(1).getImm();
const unsigned Mask = 0xFFFF;
+ if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
+ // Useless def, and we don't want to risk creating an invalid ORR (which
+ // would really write to sp).
+ MI.eraseFromParent();
+ return true;
+ }
+
// Try a MOVI instruction (aka ORR-immediate with the zero register).
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
@@ -531,7 +549,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
LastShift = (TZ / 16) * 16;
}
unsigned Imm16 = (Imm >> Shift) & Mask;
- unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
@@ -572,10 +589,178 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MBB->addLiveIn(*I);
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
+ unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxr xDest, [xAddr]
+ // cmp xDest, xDesired
+ // b.ne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired)
+ .addImm(ExtendImm);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+ .addImm(AArch64CC::NE)
+ .addMBB(DoneBB)
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxr wStatus, xNew, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
+ .addOperand(New)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP_128(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &DestLo = MI.getOperand(0);
+ MachineOperand &DestHi = MI.getOperand(1);
+ unsigned StatusReg = MI.getOperand(2).getReg();
+ MachineOperand &Addr = MI.getOperand(3);
+ MachineOperand &DesiredLo = MI.getOperand(4);
+ MachineOperand &DesiredHi = MI.getOperand(5);
+ MachineOperand &NewLo = MI.getOperand(6);
+ MachineOperand &NewHi = MI.getOperand(7);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxp xDestLo, xDestHi, [xAddr]
+ // cmp xDestLo, xDesiredLo
+ // sbcs xDestHi, xDesiredHi
+ // b.ne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(DestLo.getReg());
+ LoadCmpBB->addLiveIn(DestHi.getReg());
+ LoadCmpBB->addLiveIn(DesiredLo.getReg());
+ LoadCmpBB->addLiveIn(DesiredHi.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+ .addReg(DestLo.getReg(), RegState::Define)
+ .addReg(DestHi.getReg(), RegState::Define)
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+ .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
+ .addOperand(DesiredLo)
+ .addImm(0);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR)
+ .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
+ .addOperand(DesiredHi);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+ .addImm(AArch64CC::NE)
+ .addMBB(DoneBB)
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxp wStatus, xNewLo, xNewHi, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(NewLo.getReg());
+ StoreBB->addLiveIn(NewHi.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+ BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+ .addOperand(NewLo)
+ .addOperand(NewHi)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
/// \brief If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -717,6 +902,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::CMP_SWAP_8:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_16:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_32:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
+ AArch64::SUBSWrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_64:
+ return expandCMP_SWAP(MBB, MBBI,
+ AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::XZR, NextMBBI);
+ case AArch64::CMP_SWAP_128:
+ return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
}
return false;
}
@@ -729,7 +936,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
- Modified |= expandMI(MBB, MBBI);
+ Modified |= expandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
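Taken together, the CMP_SWAP_32 case above expands into the classic exclusive-monitor retry loop; the sketch below (hypothetical register assignment) just linearizes the block comments from expandCMP_SWAP:

//   .Lloadcmp:
//     ldaxr w0, [x2]        ; load-acquire exclusive: Dest
//     cmp   w0, w3          ; equal to Desired?
//     b.ne  .Ldone          ; no: fail without storing
//   .Lstore:
//     stlxr w1, w4, [x2]    ; store-release exclusive: Status, New
//     cbnz  w1, .Lloadcmp   ; reservation lost: retry
//   .Ldone: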
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 0ac4b39b03572..e2ab7ab79be19 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -37,7 +37,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
namespace {
@@ -144,8 +143,8 @@ private:
bool computeCallAddress(const Value *V, Address &Addr);
bool simplifyAddress(Address &Addr, MVT VT);
void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
- unsigned Flags, unsigned ScaleFactor,
- MachineMemOperand *MMO);
+ MachineMemOperand::Flags Flags,
+ unsigned ScaleFactor, MachineMemOperand *MMO);
bool isMemCpySmall(uint64_t Len, unsigned Alignment);
bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
unsigned Alignment);
@@ -439,9 +438,6 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
.addReg(ADRPReg)
.addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
AArch64II::MO_NC);
- } else if (OpFlags & AArch64II::MO_CONSTPOOL) {
- // We can't handle addresses loaded from a constant pool quickly yet.
- return 0;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
@@ -555,10 +551,9 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
// Iterate through the GEP folding the constants into offsets where
// we can.
- gep_type_iterator GTI = gep_type_begin(U);
- for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
- ++i, ++GTI) {
- const Value *Op = *i;
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
@@ -947,10 +942,7 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const {
return true;
const auto *I = cast<Instruction>(V);
- if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
- return true;
-
- return false;
+ return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
}
bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
@@ -1048,7 +1040,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
void AArch64FastISel::addLoadStoreOperands(Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags,
+ MachineMemOperand::Flags Flags,
unsigned ScaleFactor,
MachineMemOperand *MMO) {
int64_t Offset = Addr.getOffset() / ScaleFactor;
@@ -1612,8 +1604,8 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
unsigned LHSReg, bool LHSIsKill,
uint64_t Imm) {
- assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
- "ISD nodes are not consecutive!");
+ static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+ "ISD nodes are not consecutive!");
static const unsigned OpcTable[3][2] = {
{ AArch64::ANDWri, AArch64::ANDXri },
{ AArch64::ORRWri, AArch64::ORRXri },
@@ -1659,8 +1651,8 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
unsigned LHSReg, bool LHSIsKill,
unsigned RHSReg, bool RHSIsKill,
uint64_t ShiftImm) {
- assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
- "ISD nodes are not consecutive!");
+ static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+ "ISD nodes are not consecutive!");
static const unsigned OpcTable[3][2] = {
{ AArch64::ANDWrs, AArch64::ANDXrs },
{ AArch64::ORRWrs, AArch64::ORRXrs },
@@ -1904,6 +1896,21 @@ bool AArch64FastISel::selectLoad(const Instruction *I) {
cast<LoadInst>(I)->isAtomic())
return false;
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// See if we can handle this address.
Address Addr;
if (!computeAddress(I->getOperand(0), Addr, I->getType()))
@@ -2068,6 +2075,21 @@ bool AArch64FastISel::selectStore(const Instruction *I) {
cast<StoreInst>(I)->isAtomic())
return false;
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Get the value to be stored into a register. Use the zero register directly
// when possible to avoid an unnecessary copy and a wasted register.
unsigned SrcReg = 0;
@@ -2813,6 +2835,8 @@ bool AArch64FastISel::fastLowerArguments() {
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
@@ -3064,7 +3088,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
for (auto Flag : CLI.OutFlags)
- if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+ if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() ||
+ Flag.isSwiftSelf() || Flag.isSwiftError())
return false;
// Set up the argument vectors.
@@ -3646,6 +3671,10 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (F.isVarArg())
return false;
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
@@ -4814,18 +4843,18 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
// Keep a running tab of the total offset to coalesce multiple N = N + Offset
// into a single N = N + TotalOffset.
uint64_t TotalOffs = 0;
- Type *Ty = I->getOperand(0)->getType();
MVT VT = TLI.getPointerTy(DL);
- for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
- const Value *Idx = *OI;
- if (auto *StTy = dyn_cast<StructType>(Ty)) {
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (auto *StTy = dyn_cast<StructType>(*GTI)) {
unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
// N = N + Offset
if (Field)
TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
- Ty = StTy->getElementType(Field);
} else {
- Ty = cast<SequentialType>(Ty)->getElementType();
+ Type *Ty = GTI.getIndexedType();
+
// If this is a constant subscript, handle it quickly.
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero())
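A worked instance of the offset folding in the rewritten loop, assuming the default data layout (so { i32, i64 } has field 1 at offset 8 and an allocation size of 16 bytes):

//   %p = getelementptr { i32, i64 }, { i32, i64 }* %base, i64 1, i32 1
//     sequential index 1:  TotalOffs += 1 * 16   ; alloc size of indexed type
//     struct index 1:      TotalOffs += 8        ; StructLayout field offset
//   => materialized as a single "add xP, xBase, #24"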
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3f63d049c34ed..82111e5c72593 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -93,6 +93,7 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned NumBytes = AFI->getLocalStackSize();
- // Note: currently hasFP() is always true for hasCalls(), but that's an
- // implementation detail of the current code, not a strict requirement,
- // so stay safe here and check both.
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
- return false;
- return true;
+ return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128);
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
- MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF));
+ // Retain behavior of always omitting the FP for leaf functions when possible.
+ return (MFI->hasCalls() &&
+ MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+ MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+ MFI->hasStackMap() || MFI->hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF);
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo()->hasVarSizedObjects();
}
-void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
@@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
unsigned Align = getStackAlignment();
int64_t Amount = I->getOperand(0).getImm();
- Amount = RoundUpToAlignment(Amount, Align);
+ Amount = alignTo(Amount, Align);
if (!IsDestroy)
Amount = -Amount;
@@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
//
- // Mostly call frames will be allocated at the start of a function so
+ // Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
@@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
TII);
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout &TD = MF.getDataLayout();
- bool HasFP = hasFP(MF);
-
- // Calculate amount of bytes used for return address storing.
- int stackGrowth = -TD.getPointerSize(0);
-
- // Calculate offsets.
- int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
- unsigned TotalSkipped = 0;
for (const auto &Info : CSI) {
unsigned Reg = Info.getReg();
- int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
- getOffsetOfLocalArea() + saveAreaOffset;
-
- // Don't output a new CFI directive if we're re-saving the frame pointer or
- // link register. This happens when the PrologEpilogInserter has inserted an
- // extra "STP" of the frame pointer and link register -- the "emitPrologue"
- // method automatically generates the directives when frame pointers are
- // used. If we generate CFI directives for the extra "STP"s, the linker will
- // lose track of the correct values for the frame pointer and link register.
- if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
- TotalSkipped += stackGrowth;
- continue;
- }
-
+ int64_t Offset =
+ MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, DwarfReg, Offset - TotalSkipped));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
}
-/// Get FPOffset by analyzing the first instruction.
-static int getFPOffsetInPrologue(MachineInstr *MBBI) {
- // First instruction must a) allocate the stack and b) have an immediate
- // that is a multiple of -2.
- assert(((MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre) &&
- MBBI->getOperand(3).getReg() == AArch64::SP &&
- MBBI->getOperand(4).getImm() < 0 &&
- (MBBI->getOperand(4).getImm() & 1) == 0));
-
- // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
- // required for the callee saved register area we get the frame pointer
- // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
- int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
- assert(FPOffset >= 0 && "Bad Framepointer Offset");
- return FPOffset;
-}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+ MachineFunction *MF = MBB->getParent();
+
+ // If MBB is the entry block, use X9 as the scratch register.
+ if (&MF->front() == MBB)
+ return AArch64::X9;
+
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ LivePhysRegs LiveRegs(&TRI);
+ LiveRegs.addLiveIns(*MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ // Prefer X9 since it was historically used for the prologue scratch reg.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ if (LiveRegs.available(MRI, AArch64::X9))
+ return AArch64::X9;
-static bool isCSSave(MachineInstr *MBBI) {
- return MBBI->getOpcode() == AArch64::STPXi ||
- MBBI->getOpcode() == AArch64::STPDi ||
- MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre;
+ for (unsigned Reg : AArch64::GPR64RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ return AArch64::NoRegister;
}
bool AArch64FrameLowering::canUseAsPrologue(
const MachineBasicBlock &MBB) const {
const MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
- // Otherwise, we may need a scratch register to be available and we do not
- // support that for now.
- return !RegInfo->needsStackRealignment(*MF);
+ if (!RegInfo->needsStackRealignment(*MF))
+ return true;
+ // Otherwise, we can use any block as long as it has a scratch register
+ // available.
+ return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
+}
+
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+ MachineFunction &MF, unsigned StackBumpBytes) const {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+ if (AFI->getLocalStackSize() == 0)
+ return false;
+
+ // 512 is the maximum immediate for the stp/ldp instructions that will be
+ // used for callee-save save/restores.
+ if (StackBumpBytes >= 512)
+ return false;
+
+ if (MFI->hasVarSizedObjects())
+ return false;
+
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // This isn't strictly necessary, but it simplifies things a bit since the
+ // current RedZone handling code assumes the SP is adjusted by the
+ // callee-save save/restore code.
+ if (canUseRedZone(MF))
+ return false;
+
+ return true;
+}
+
+// Convert a callee-save register save/restore instruction into one that also
+// decrements/increments the stack pointer to allocate/deallocate the
+// callee-save stack area, by switching the store/load to its pre/post
+// increment form.
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+
+ unsigned NewOpc;
+ bool NewIsUnscaled = false;
+ switch (MBBI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected callee-save save/restore opcode!");
+ case AArch64::STPXi:
+ NewOpc = AArch64::STPXpre;
+ break;
+ case AArch64::STPDi:
+ NewOpc = AArch64::STPDpre;
+ break;
+ case AArch64::STRXui:
+ NewOpc = AArch64::STRXpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::STRDui:
+ NewOpc = AArch64::STRDpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDPXi:
+ NewOpc = AArch64::LDPXpost;
+ break;
+ case AArch64::LDPDi:
+ NewOpc = AArch64::LDPDpost;
+ break;
+ case AArch64::LDRXui:
+ NewOpc = AArch64::LDRXpost;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDRDui:
+ NewOpc = AArch64::LDRDpost;
+ NewIsUnscaled = true;
+ break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ // Copy all operands other than the immediate offset.
+ unsigned OpndIdx = 0;
+ for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
+ ++OpndIdx)
+ MIB.addOperand(MBBI->getOperand(OpndIdx));
+
+ assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
+ "Unexpected immediate offset in first/last callee-save save/restore "
+ "instruction!");
+ assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ assert(CSStackSizeInc % 8 == 0);
+ int64_t CSStackSizeIncImm = CSStackSizeInc;
+ if (!NewIsUnscaled)
+ CSStackSizeIncImm /= 8;
+ MIB.addImm(CSStackSizeIncImm);
+
+ MIB.setMIFlags(MBBI->getFlags());
+ MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+ return std::prev(MBB.erase(MBBI));
+}
+
+// Fix up callee-save register save/restore instructions to account for the
+// combined SP bump by adding the local stack size to their stack offsets.
+static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+ unsigned LocalStackSize) {
+ unsigned Opc = MI.getOpcode();
+ (void)Opc;
+ assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
+ Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
+ Opc == AArch64::LDPXi || Opc == AArch64::LDPDi ||
+ Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) &&
+ "Unexpected callee-save save/restore opcode!");
+
+ unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
+ assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
+ // All generated opcodes have scaled offsets.
+ assert(LocalStackSize % 8 == 0);
+ OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
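Concretely, for a 48-byte callee-save area and 32 bytes of locals (sizes hypothetical), the two helpers above produce one of these shapes:

//   Separate bumps (convertCalleeSaveRestoreToSPPrePostIncDec):
//     stp x22, x21, [sp, #0]   becomes   stp x22, x21, [sp, #-48]!  ; prologue
//     ldp x22, x21, [sp, #0]   becomes   ldp x22, x21, [sp], #48    ; epilogue
//
//   Combined bump (fixupCalleeSaveRestoreStackOffset):
//     sub sp, sp, #80                    ; one SP update for CSR area + locals
//     stp x22, x21, [sp, #32]            ; offset biased by LocalStackSize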
@@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
- // Label used to tie together the PROLOG_LABEL and the MachineMoves.
- MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-
+ if (!NumBytes)
+ return;
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
- if (NumBytes && !canUseRedZone(MF)) {
+ if (canUseRedZone(MF))
+ ++NumRedZoneFunctions;
+ else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
- } else if (NumBytes) {
- ++NumRedZoneFunctions;
}
-
return;
}
- // Only set up FP if we actually need to.
- int FPOffset = 0;
- if (HasFP)
- FPOffset = getFPOffsetInPrologue(MBBI);
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes - CSStackSize);
- // Move past the saves of the callee-saved registers.
- while (isCSSave(MBBI)) {
- ++MBBI;
- NumBytes -= 16;
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ NumBytes = 0;
+ } else if (CSStackSize != 0) {
+ MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
+ -CSStackSize);
+ NumBytes -= CSStackSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ // Move past the saves of the callee-saved registers, fixing up their
+ // offsets if we decided to combine the callee-save and local stack pointer
+ // bumps above.
+ MachineBasicBlock::iterator End = MBB.end();
+ while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+ ++MBBI;
+ }
if (HasFP) {
+ // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
+ int FPOffset = CSStackSize - 16;
+ if (CombineSPBump)
+ FPOffset += AFI->getLocalStackSize();
+
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
@@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup);
}
- // All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes);
-
// Allocate space for the rest of the frame.
+ if (NumBytes) {
+ const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ unsigned scratchSPReg = AArch64::SP;
- const unsigned Alignment = MFI->getMaxAlignment();
- const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
- unsigned scratchSPReg = AArch64::SP;
- if (NumBytes && NeedsRealignment) {
- // Use the first callee-saved register as a scratch register.
- scratchSPReg = AArch64::X9;
- }
+ if (NeedsRealignment) {
+ scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(scratchSPReg != AArch64::NoRegister);
+ }
- // If we're a leaf function, try using the red zone.
- if (NumBytes && !canUseRedZone(MF))
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup);
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
- if (NumBytes && NeedsRealignment) {
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
- assert(NrBitsToZero > 1);
- assert(scratchSPReg != AArch64::SP);
-
- // SUB X9, SP, NumBytes
- // -- X9 is temporary register, so shouldn't contain any live data here,
- // -- free to use. This is already produced by emitFrameOffset above.
- // AND SP, X9, 0b11111...0000
- // The logical immediates have a non-trivial encoding. The following
- // formula computes the encoded immediate with all ones but
- // NrBitsToZero zero bits as least significant bits.
- uint32_t andMaskEncoded =
- (1 <<12) // = N
- | ((64-NrBitsToZero) << 6) // immr
- | ((64-NrBitsToZero-1) << 0) // imms
- ;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
- .addReg(scratchSPReg, RegState::Kill)
- .addImm(andMaskEncoded);
+ if (NeedsRealignment) {
+ const unsigned Alignment = MFI->getMaxAlignment();
+ const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ assert(NrBitsToZero > 1);
+ assert(scratchSPReg != AArch64::SP);
+
+ // SUB X9, SP, NumBytes
+ // -- X9 is temporary register, so shouldn't contain any live data here,
+ // -- free to use. This is already produced by emitFrameOffset above.
+ // AND SP, X9, 0b11111...0000
+ // The logical immediates have a non-trivial encoding. The following
+ // formula computes the encoded immediate with all ones but
+ // NrBitsToZero zero bits as least significant bits.
+ uint32_t andMaskEncoded = (1 << 12) // = N
+ | ((64 - NrBitsToZero) << 6) // immr
+ | ((64 - NrBitsToZero - 1) << 0); // imms
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+ .addReg(scratchSPReg, RegState::Kill)
+ .addImm(andMaskEncoded);
+ AFI->setStackRealigned(true);
+ }
}
// If we need a base pointer, set it up here. It's whatever the value of the
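To make the logical-immediate encoding above concrete, here is a worked instance for a hypothetical 64-byte realignment; this is standalone illustration code, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NrBitsToZero = 6;             // countTrailingZeros(64)
  uint32_t andMaskEncoded = (1 << 12)                    // N    = 1
                            | ((64 - NrBitsToZero) << 6) // immr = 58
                            | (64 - NrBitsToZero - 1);   // imms = 57
  // imms = 57 selects a run of 58 ones; rotating by immr = 58 leaves the low
  // 6 bits clear, so the decoded mask is 0xFFFFFFFFFFFFFFC0 and the emitted
  // instruction is effectively "and sp, x9, #~63".
  assert(andMaskEncoded == 0x1EB9);
  return 0;
}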
@@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored LR
- unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored FP
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
@@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
}
- // Now emit the moves for whatever callee saved regs we have.
- emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
- }
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-/// Checks whether the given instruction restores callee save registers
-/// and if so returns how many.
-static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) {
- unsigned RtIdx = 0;
- switch (MI.getOpcode()) {
- case AArch64::LDPXpost:
- case AArch64::LDPDpost:
- RtIdx = 1;
- // FALLTHROUGH
- case AArch64::LDPXi:
- case AArch64::LDPDi:
- if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) ||
- !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) ||
- MI.getOperand(RtIdx + 2).getReg() != AArch64::SP)
- return 0;
- return 2;
+ // Now emit the moves for whatever callee saved regs we have (including FP,
+ // LR if those are saved).
+ emitCalleeSavedFrameMoves(MBB, MBBI);
}
- return 0;
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool IsTailCallReturn = false;
@@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
- // | (NumRestores * 8) | | |
+ // | (CalleeSavedStackSize)| | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
@@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
//
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
- NumBytes += ArgumentPopSize;
- unsigned NumRestores = 0;
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+
+ if (!CombineSPBump && CSStackSize != 0)
+ convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
--LastPopI;
- unsigned Restores = getNumCSRestores(*LastPopI, CSRegs);
- NumRestores += Restores;
- if (Restores == 0) {
+ if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
++LastPopI;
break;
- }
+ } else if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+ }
+
+ // If there is a single SP update, insert it before the ret and we're done.
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ NumBytes + ArgumentPopSize, TII,
+ MachineInstr::FrameDestroy);
+ return;
}
- NumBytes -= NumRestores * 8;
+
+ NumBytes -= CSStackSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
+ bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
- // stack pointer.
- if (!canUseRedZone(MF))
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
- TII);
- return;
+ // stack pointer (but we may need to pop stack args for fastcc).
+ if (RedZone && ArgumentPopSize == 0)
+ return;
+
+ bool NoCalleeSaveRestore = CSStackSize == 0;
+ int StackRestoreBytes = RedZone ? 0 : NumBytes;
+ if (NoCalleeSaveRestore)
+ StackRestoreBytes += ArgumentPopSize;
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ // If we were able to combine the local stack pop with the argument pop,
+ // then we're done.
+ if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ return;
+ NumBytes = 0;
}
// Restore the original stack pointer.
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
- if (NumBytes || MFI->hasVarSizedObjects())
+ if (MFI->hasVarSizedObjects() || AFI->isStackRealigned())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags);
+ -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+ else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
+ MachineInstr::FrameDestroy);
+
+ // This must be placed after the callee-save restore code because that code
+ // assumes the SP is at the same location as it was after the callee-save save
+ // code in the prologue.
+ if (ArgumentPopSize)
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ ArgumentPopSize, TII, MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
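The resulting epilogue for the no-FP, non-red-zone case, with hypothetical sizes (32 bytes of locals, a 32-byte callee-save area, 16 bytes of fastcc argument pop), shows why the argument pop has to trail the restores:

//   add sp, sp, #32           ; deallocate locals, inserted at LastPopI
//   ldp x20, x19, [sp, #16]   ; FrameDestroy restores at fixed offsets
//   ldp x22, x21, [sp], #32   ; last restore post-increments away the CSR area
//   add sp, sp, #16           ; ArgumentPopSize, emitted after the restores
//   ret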
@@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
- if (Reg != AArch64::LR)
- return getKillRegState(true);
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments passed
+ // in callee-saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ return getKillRegState(!IsLiveIn);
+}
- // LR maybe referred to later by an @llvm.returnaddress intrinsic.
- bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
- bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
- return getKillRegState(LRKill);
+static bool produceCompactUnwindFrame(MachineFunction &MF) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ AttributeSet Attrs = MF.getFunction()->getAttributes();
+ return Subtarget.isTargetMachO() &&
+ !(Subtarget.getTargetLowering()->supportSwiftError() &&
+ Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
-bool AArch64FrameLowering::spillCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+struct RegPairInfo {
+ RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
+ unsigned Reg1;
+ unsigned Reg2;
+ int FrameIdx;
+ int Offset;
+ bool IsGPR;
+ bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+};
+
+static void computeCalleeSaveRegisterPairs(
+ MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+
+ if (CSI.empty())
+ return;
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
unsigned Count = CSI.size();
- DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ (void)CC;
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (Count & 1) == 0) &&
+ "Odd number of callee-saved regs to spill!");
+ unsigned Offset = AFI->getCalleeSavedStackSize();
+
+ for (unsigned i = 0; i < Count; ++i) {
+ RegPairInfo RPI;
+ RPI.Reg1 = CSI[i].getReg();
+
+ assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
+ AArch64::FPR64RegClass.contains(RPI.Reg1));
+ RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);
+
+ // Add the next reg to the pair if it is in the same register class.
+ if (i + 1 < Count) {
+ unsigned NextReg = CSI[i + 1].getReg();
+ if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
+ (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
+ RPI.Reg2 = NextReg;
+ }
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned idx = Count - i - 2;
- unsigned Reg1 = CSI[idx].getReg();
- unsigned Reg2 = CSI[idx + 1].getReg();
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
//
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
- assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+ assert((!RPI.isPaired() ||
+ (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
+
+ // MachO's compact unwind format relies on all registers being stored in
+ // adjacent register pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (RPI.isPaired() &&
+ ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
+ RPI.Reg1 + 1 == RPI.Reg2))) &&
+ "Callee-save registers not saved as adjacent register pair!");
+
+ RPI.FrameIdx = CSI[i].getFrameIdx();
+
+ if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
+ // Round up size of non-pair to pair size if we need to pad the
+ // callee-save area to ensure 16-byte alignment.
+ Offset -= 16;
+ assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16);
+ MFI->setObjectAlignment(RPI.FrameIdx, 16);
+ AFI->setCalleeSaveStackHasFreeSpace(true);
+ } else
+ Offset -= RPI.isPaired() ? 16 : 8;
+ assert(Offset % 8 == 0);
+ RPI.Offset = Offset / 8;
+ assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+ "Offset out of bounds for LDP/STP immediate");
+
+ RegPairs.push_back(RPI);
+ if (RPI.isPaired())
+ ++i;
+ }
+}
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ SmallVector<RegPairInfo, 8> RegPairs;
+
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- // Issue sequence of non-sp increment and pi sp spills for cs regs. The
- // first spill is a pre-increment that allocates the stack.
+
+ // Issue sequence of spills for cs regs. The first spill may be converted
+ // to a pre-decrement store later by emitPrologue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
- // stp x22, x21, [sp, #-48]! // addImm(-6)
+ // stp x22, x21, [sp, #0] // addImm(+0)
// stp x20, x19, [sp, #16] // addImm(+2)
// stp fp, lr, [sp, #32] // addImm(+4)
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
- // Note: Similar rational and sequence for restores in epilog.
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPXpre;
- else
- StrOpc = AArch64::STPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPDpre;
- else
- StrOpc = AArch64::STPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
- << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
- // Compute offset: i = 0 => offset = -Count;
- // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
- const int Offset = (i == 0) ? -Count : i;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for STP immediate");
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
- MIB.addReg(AArch64::SP, RegState::Define);
+ // Note: Similar rationale and sequence for restores in epilog.
+ if (RPI.IsGPR)
+ StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
+ else
+ StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
+ DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+      dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
MBB.addLiveIn(Reg1);
- MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
- .addReg(Reg1, getPrologueDeath(MF, Reg1))
+ if (RPI.isPaired()) {
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOStore, 8, 8));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
- .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit
.setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOStore, 8, 8));
}
return true;
}
@@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- unsigned Count = CSI.size();
DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ SmallVector<RegPairInfo, 8> RegPairs;
if (MI != MBB.end())
DL = MI->getDebugLoc();
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned Reg1 = CSI[i].getReg();
- unsigned Reg2 = CSI[i + 1].getReg();
- // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
- // list to come in sorted by frame index so that we can issue the store
- // pair instructions directly. Assert if we see anything otherwise.
- assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
- "Out of order callee saved regs!");
- // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
- // the last load is sp-pi post-increment and de-allocates the stack:
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
+
+ // Issue sequence of restores for cs regs. The last restore may be converted
+ // to a post-increment load later by emitEpilogue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
// ldp fp, lr, [sp, #32] // addImm(+4)
// ldp x20, x19, [sp, #16] // addImm(+2)
- // ldp x22, x21, [sp], #48 // addImm(+6)
+ // ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
+ if (RPI.IsGPR)
+ LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
+ else
+ LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
+ DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+      dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPXpost;
- else
- LdrOpc = AArch64::LDPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPDpost;
- else
- LdrOpc = AArch64::LDPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
- << ", " << CSI[i + 1].getFrameIdx() << ")\n");
-
- // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
- // etc.
- const int Offset = (i == Count - 2) ? Count : Count - i - 2;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for LDP immediate");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
- if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
- MIB.addReg(AArch64::SP, RegState::Define);
-
- MIB.addReg(Reg2, getDefRegState(true))
- .addReg(Reg1, getDefRegState(true))
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOLoad, 8, 8));
+ }
+ MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
- .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
- // where the factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOLoad, 8, 8));
}
return true;
}
@@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- SmallVector<unsigned, 4> UnspilledCSGPRs;
- SmallVector<unsigned, 4> UnspilledCSFPRs;
+ unsigned UnspilledCSGPR = AArch64::NoRegister;
+ unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
// The frame record needs to be created by saving the appropriate registers
if (hasFP(MF)) {
@@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(AArch64::LR);
}
- // Spill the BasePtr if it's used. Do this first thing so that the
- // getCalleeSavedRegs() below will get the right answer.
+ unsigned BasePointerReg = AArch64::NoRegister;
if (RegInfo->hasBasePointer(MF))
- SavedRegs.set(RegInfo->getBaseRegister());
-
- if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF))
- SavedRegs.set(AArch64::X9);
+ BasePointerReg = RegInfo->getBaseRegister();
- // If any callee-saved registers are used, the frame cannot be eliminated.
- unsigned NumGPRSpilled = 0;
- unsigned NumFPRSpilled = 0;
bool ExtraCSSpill = false;
- bool CanEliminateFrame = true;
- DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ // Figure out which callee-saved registers to save/restore.
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ const unsigned Reg = CSRegs[i];
+
+ // Add the base pointer register to SavedRegs if it is callee-save.
+ if (Reg == BasePointerReg)
+ SavedRegs.set(Reg);
- // Check pairs of consecutive callee-saved registers.
- for (unsigned i = 0; CSRegs[i]; i += 2) {
- assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
-
- const unsigned OddReg = CSRegs[i];
- const unsigned EvenReg = CSRegs[i + 1];
- assert((AArch64::GPR64RegClass.contains(OddReg) &&
- AArch64::GPR64RegClass.contains(EvenReg)) ^
- (AArch64::FPR64RegClass.contains(OddReg) &&
- AArch64::FPR64RegClass.contains(EvenReg)) &&
- "Register class mismatch!");
-
- const bool OddRegUsed = SavedRegs.test(OddReg);
- const bool EvenRegUsed = SavedRegs.test(EvenReg);
-
- // Early exit if none of the registers in the register pair is actually
- // used.
- if (!OddRegUsed && !EvenRegUsed) {
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- UnspilledCSGPRs.push_back(OddReg);
- UnspilledCSGPRs.push_back(EvenReg);
- } else {
- UnspilledCSFPRs.push_back(OddReg);
- UnspilledCSFPRs.push_back(EvenReg);
+ bool RegUsed = SavedRegs.test(Reg);
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (!RegUsed) {
+ if (AArch64::GPR64RegClass.contains(Reg) &&
+ !RegInfo->isReservedReg(MF, Reg)) {
+ UnspilledCSGPR = Reg;
+ UnspilledCSGPRPaired = PairedReg;
}
continue;
}
- unsigned Reg = AArch64::NoRegister;
- // If only one of the registers of the register pair is used, make sure to
- // mark the other one as used as well.
- if (OddRegUsed ^ EvenRegUsed) {
- // Find out which register is the additional spill.
- Reg = OddRegUsed ? EvenReg : OddReg;
- SavedRegs.set(Reg);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ // FIXME: the usual format is actually better if unwinding isn't needed.
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+ SavedRegs.set(PairedReg);
+ if (AArch64::GPR64RegClass.contains(PairedReg) &&
+ !RegInfo->isReservedReg(MF, PairedReg))
+ ExtraCSSpill = true;
}
+ }
- DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
- DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
-
- assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
- (RegInfo->getEncodingValue(OddReg) + 1 ==
- RegInfo->getEncodingValue(EvenReg))) &&
- "Register pair of non-adjacent registers!");
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- NumGPRSpilled += 2;
- // If it's not a reserved register, we can use it in lieu of an
- // emergency spill slot for the register scavenger.
- // FIXME: It would be better to instead keep looking and choose another
- // unspilled register that isn't reserved, if there is one.
- if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- } else
- NumFPRSpilled += 2;
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ for (int Reg = SavedRegs.find_first(); Reg != -1;
+ Reg = SavedRegs.find_next(Reg))
+ dbgs() << ' ' << PrintReg(Reg, RegInfo);
+ dbgs() << "\n";);
- CanEliminateFrame = false;
- }
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumRegsSpilled = SavedRegs.count();
+ bool CanEliminateFrame = NumRegsSpilled == 0;
// FIXME: Set BigStack if any stack slot references may be out of range.
  // For now, just conservatively guesstimate based on unscaled indexing
@@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned CFSize =
- MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+ unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
bool BigStack = (CFSize >= 256);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
@@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// above to keep the number of spills even, we don't need to do anything else
// here.
if (BigStack && !ExtraCSSpill) {
-
- // If we're adding a register to spill here, we have to add two of them
- // to keep the number of regs to spill even.
- assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
- unsigned Count = 0;
- while (!UnspilledCSGPRs.empty() && Count < 2) {
- unsigned Reg = UnspilledCSGPRs.back();
- UnspilledCSGPRs.pop_back();
- DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
- << " to get a scratch register.\n");
- SavedRegs.set(Reg);
+ if (UnspilledCSGPR != AArch64::NoRegister) {
+ DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
+ << " to get a scratch register.\n");
+ SavedRegs.set(UnspilledCSGPR);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs, so if we need to spill one extra for BigStack, then we need to
+ // store the pair.
+ if (produceCompactUnwindFrame(MF))
+ SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = true;
- ++Count;
+ NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
@@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
<< " as the emergency spill slot.\n");
}
}
+
+ // Round up to register pair alignment to avoid additional SP adjustment
+ // instructions.
+ AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+}
+
+bool AArch64FrameLowering::enableStackSlotScavenging(
+ const MachineFunction &MF) const {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasCalleeSaveStackFreeSpace();
}
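To make the rounding and the resulting free slot concrete, a hedged sketch of the arithmetic; alignTo below mirrors the LLVM helper of the same name, and the register counts are invented:

#include <cstdint>

// Minimal re-implementation of the rounding used by determineCalleeSaves.
constexpr uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// An odd register count leaves an 8-byte gap in the callee-save area; that
// gap is what setCalleeSaveStackHasFreeSpace(true) records and what
// enableStackSlotScavenging() then exposes to the scavenger.
static_assert(alignTo(8 * 5, 16) == 48, "odd count is padded to pair size");
static_assert(alignTo(8 * 4, 16) == 32, "even count needs no padding");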
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 7d8354c38787c..f254ea9b70aa7 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -25,12 +25,11 @@ public:
true /*StackRealignable*/) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const;
+ MachineBasicBlock::iterator MBBI) const;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -67,6 +66,12 @@ public:
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
+
+ bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+
+private:
+ bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6c868880bcac4..8d649250f6569 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -57,7 +57,7 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
- SDNode *Select(SDNode *Node) override;
+ void Select(SDNode *Node) override;
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -65,8 +65,8 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
- SDNode *SelectMLAV64LaneV128(SDNode *N);
- SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+ bool tryMLAV64LaneV128(SDNode *N);
+ bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
@@ -147,28 +147,29 @@ public:
SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
const unsigned SubRegs[]);
- SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+ void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
- SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+ bool tryIndexedLoad(SDNode *N);
- SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
- SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
- SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectBitfieldExtractOp(SDNode *N);
- SDNode *SelectBitfieldInsertOp(SDNode *N);
- SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);
+ bool tryBitfieldExtractOp(SDNode *N);
+ bool tryBitfieldExtractOpFromSExt(SDNode *N);
+ bool tryBitfieldInsertOp(SDNode *N);
+ bool tryBitfieldInsertInZeroOp(SDNode *N);
- SDNode *SelectReadRegister(SDNode *N);
- SDNode *SelectWriteRegister(SDNode *N);
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AArch64GenDAGISel.inc"
@@ -198,6 +199,9 @@ private:
}
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
+
+ void SelectCMP_SWAP(SDNode *N);
+
};
} // end anonymous namespace
@@ -328,9 +332,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
  // Folding hurts if the value is used at least twice, unless we are
  // optimizing for code size.
- if (ForCodeSize || V.hasOneUse())
- return true;
- return false;
+ return ForCodeSize || V.hasOneUse();
}
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
@@ -452,7 +454,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
/// is a lane in the upper half of a 128-bit vector. Recognize and select this
/// so that we don't emit unnecessary lane extracts.
-SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -467,7 +469,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
LaneIdx))
- return nullptr;
+ return false;
}
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
@@ -493,10 +495,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
break;
}
- return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
+ return true;
}
-SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
SDLoc dl(N);
SDValue SMULLOp0;
SDValue SMULLOp1;
@@ -504,7 +507,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
LaneIdx))
- return nullptr;
+ return false;
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
@@ -537,7 +540,8 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
} else
llvm_unreachable("Unrecognized intrinsic.");
- return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
+ return true;
}
/// Instructions that accept extend modifiers like UXTW expect the register
@@ -610,7 +614,7 @@ static bool isWorthFoldingADDlow(SDValue N) {
// ldar and stlr have much more restrictive addressing modes (just a
// register).
- if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+ if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
return false;
}
@@ -687,7 +691,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
const GlobalValue *GV = GAN->getGlobal();
unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getType()->getElementType();
+ Type *Ty = GV->getValueType();
if (Alignment == 0 && Ty->isSized())
Alignment = DL.getABITypeAlignment(Ty);
@@ -797,10 +801,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- if (isWorthFolding(N))
- return true;
-
- return false;
+ return isWorthFolding(N);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1015,8 +1016,8 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
return SDValue(N, 0);
}
-SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
- unsigned Opc, bool isExt) {
+void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
+ bool isExt) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -1033,13 +1034,13 @@ SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
Ops.push_back(N->getOperand(1));
Ops.push_back(RegSeq);
Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
}
-SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isUnindexed())
- return nullptr;
+ return false;
EVT VT = LD->getMemoryVT();
EVT DstVT = N->getValueType(0);
ISD::MemIndexedMode AM = LD->getAddressingMode();
@@ -1101,7 +1102,7 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
} else if (VT.is128BitVector()) {
Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
} else
- return nullptr;
+ return false;
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
@@ -1112,7 +1113,6 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
// Either way, we're replacing the node, so tell the caller that.
- Done = true;
SDValue LoadedVal = SDValue(Res, 1);
if (InsertTo64) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
@@ -1127,12 +1127,12 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
ReplaceUses(SDValue(N, 0), LoadedVal);
ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
-
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return true;
}
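As an illustration of what tryIndexedLoad matches, a small source sketch (function name invented; whether the indexed form is actually chosen depends on earlier DAG combines):

#include <cstdint>

uint32_t sum(const uint32_t *p, int n) {
  uint32_t s = 0;
  for (int i = 0; i != n; ++i)
    s += *p++; // the pointer bump can fold into the load: ldr w8, [x0], #4
  return s;
}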
-SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
- unsigned Opc, unsigned SubRegIdx) {
+void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
@@ -1149,11 +1149,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
- unsigned Opc, unsigned SubRegIdx) {
+void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+ unsigned Opc, unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
@@ -1181,11 +1181,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
// Update the chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
@@ -1197,11 +1197,11 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
- return St;
+ ReplaceNode(N, St);
}
-SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
@@ -1218,7 +1218,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
N->getOperand(0)}; // Chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- return St;
+ ReplaceNode(N, St);
}
namespace {
@@ -1256,8 +1256,8 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
V128Reg);
}
-SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1292,12 +1292,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
}
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
-
- return Ld;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1348,12 +1347,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
// Update the Chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
-
- return Ld;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1379,11 +1377,11 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
}
-SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1414,7 +1412,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
}
static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
@@ -1441,25 +1439,25 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// form these situations when matching bigger pattern (bitfield insert).
// For unsigned extracts, check for a shift right and mask
- uint64_t And_imm = 0;
- if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+ uint64_t AndImm = 0;
+ if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
return false;
const SDNode *Op0 = N->getOperand(0).getNode();
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
  // simplified. Try to undo that.
- And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+ AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
- if (And_imm & (And_imm + 1))
+ if (AndImm & (AndImm + 1))
return false;
bool ClampMSB = false;
- uint64_t Srl_imm = 0;
+ uint64_t SrlImm = 0;
// Handle the SRL + ANY_EXTEND case.
if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
- isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
// Extend the incoming operand of the SRL to 64-bit.
Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
// Make sure to clamp the MSB so that we preserve the semantics of the
@@ -1467,13 +1465,13 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
ClampMSB = true;
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
- Srl_imm)) {
+ SrlImm)) {
// If the shift result was truncated, we can still combine them.
Opd0 = Op0->getOperand(0).getOperand(0);
// Use the type of SRL node.
VT = Opd0->getValueType(0);
- } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+ } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
Opd0 = Op0->getOperand(0);
} else if (BiggerPattern) {
// Let's pretend a 0 shift right has been performed.
@@ -1487,15 +1485,15 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
- if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) {
+ if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
DEBUG((dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
- LSB = Srl_imm;
- MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm)
- : countTrailingOnes<uint64_t>(And_imm)) -
+ LSB = SrlImm;
+ MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
+ : countTrailingOnes<uint64_t>(AndImm)) -
1;
if (ClampMSB)
// Since we're moving the extend before the right shift operation, we need
@@ -1508,6 +1506,39 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
return true;
}
+static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &Immr,
+ unsigned &Imms) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT VT = N->getValueType(0);
+ unsigned BitWidth = VT.getSizeInBits();
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ SDValue Op = N->getOperand(0);
+ if (Op->getOpcode() == ISD::TRUNCATE) {
+ Op = Op->getOperand(0);
+ VT = Op->getValueType(0);
+ BitWidth = VT.getSizeInBits();
+ }
+
+ uint64_t ShiftImm;
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
+ !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ if (ShiftImm + Width > BitWidth)
+ return false;
+
+ Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
+ Opd0 = Op.getOperand(0);
+ Immr = ShiftImm;
+ Imms = ShiftImm + Width - 1;
+ return true;
+}
+
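A sketch of source that should reach this SIGN_EXTEND_INREG path (illustrative; the function name is invented and the exact DAG shape depends on earlier combines):

#include <cstdint>

// Bits [10:3] of x, sign-extended: an SRL under a SIGN_EXTEND_INREG.
// With ShiftImm = 3 and Width = 8 this yields SBFMWri with Immr = 3 and
// Imms = 3 + 8 - 1 = 10, i.e. sbfx w0, w0, #3, #8.
int32_t extract_signed(uint32_t x) {
  return static_cast<int8_t>(x >> 3);
}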
static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &LSB,
unsigned &MSB) {
@@ -1522,32 +1553,32 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
//
// This gets selected into a single UBFM:
//
- // UBFM Value, ShiftImm, BitWide + Srl_imm -1
+  //   UBFM Value, ShiftImm, BitWide + SrlImm - 1
//
if (N->getOpcode() != ISD::SRL)
return false;
- uint64_t And_mask = 0;
- if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+ uint64_t AndMask = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
return false;
Opd0 = N->getOperand(0).getOperand(0);
- uint64_t Srl_imm = 0;
- if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
// Check whether we really have several bits extract here.
- unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm));
- if (BitWide && isMask_64(And_mask >> Srl_imm)) {
+ unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
+ if (BitWide && isMask_64(AndMask >> SrlImm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
else
Opc = AArch64::UBFMXri;
- LSB = Srl_imm;
- MSB = BitWide + Srl_imm - 1;
+ LSB = SrlImm;
+ MSB = BitWide + SrlImm - 1;
return true;
}
@@ -1572,10 +1603,10 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
return true;
- // we're looking for a shift of a shift
- uint64_t Shl_imm = 0;
- uint64_t Trunc_bits = 0;
- if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ // We're looking for a shift of a shift.
+ uint64_t ShlImm = 0;
+ uint64_t TruncBits = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
Opd0 = N->getOperand(0).getOperand(0);
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
@@ -1584,7 +1615,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// always generate 64bit UBFM. This consistency will help the CSE pass
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
- Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+ TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
VT = Opd0->getValueType(0);
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
@@ -1597,21 +1628,21 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// Missing combines/constant folding may have left us with strange
// constants.
- if (Shl_imm >= VT.getSizeInBits()) {
+ if (ShlImm >= VT.getSizeInBits()) {
DEBUG((dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
- uint64_t Srl_imm = 0;
- if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
- assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+ assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
"bad amount in shift node!");
- int immr = Srl_imm - Shl_imm;
+ int immr = SrlImm - ShlImm;
Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
- Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1;
+ Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
// SRA requires a signed extraction
if (VT == MVT::i32)
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
@@ -1620,6 +1651,30 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
return true;
}
+bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND);
+
+ EVT VT = N->getValueType(0);
+ EVT NarrowVT = N->getOperand(0)->getValueType(0);
+ if (VT != MVT::i64 || NarrowVT != MVT::i32)
+ return false;
+
+ uint64_t ShiftImm;
+ SDValue Op = N->getOperand(0);
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ SDLoc dl(N);
+ // Extend the incoming operand of the shift to 64-bits.
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
+ unsigned Immr = ShiftImm;
+ unsigned Imms = NarrowVT.getSizeInBits() - 1;
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+ CurDAG->getTargetConstant(Imms, dl, VT)};
+ CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
+ return true;
+}
+
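Likewise, a hedged sketch of the SIGN_EXTEND-of-SRA shape this new hook targets (function name invented):

#include <cstdint>

// The SRA operand is widened to 64 bits and the pair becomes a single
// SBFMXri with Immr = 5 and Imms = 32 - 1 = 31, i.e. sbfx x0, x0, #5, #27.
int64_t widen_ashr(int32_t x) {
  return static_cast<int64_t>(x >> 5);
}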
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
@@ -1638,6 +1693,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
case ISD::SRL:
case ISD::SRA:
return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
+
+ case ISD::SIGN_EXTEND_INREG:
+ return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
}
unsigned NOpc = N->getMachineOpcode();
@@ -1658,11 +1716,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
return false;
}
-SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
unsigned Opc, Immr, Imms;
SDValue Opd0;
if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
- return nullptr;
+ return false;
EVT VT = N->getValueType(0);
SDLoc dl(N);
@@ -1675,22 +1733,22 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
- MachineSDNode *Node =
- CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32,
- SDValue(BFM, 0), SubReg);
- return Node;
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+ MVT::i32, SDValue(BFM, 0), SubReg));
+ return true;
}
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
}
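For comparison, the unsigned shift-plus-mask shape handled through isBitfieldExtractOpFromAnd (illustrative source, invented name):

#include <cstdint>

// LSB = 7 and MSB = 7 + 9 - 1 = 15, so this selects a single
// ubfx w0, w0, #7, #9 (UBFMWri).
uint32_t extract_field(uint32_t x) {
  return (x >> 7) & 0x1FFu;
}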
/// Does DstMask form a complementary pair with the mask provided by
/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
/// this asks whether DstMask zeroes precisely those bits that will be set by
/// the other half.
-static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
unsigned NumberOfIgnoredHighBits, EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) &&
"i32 or i64 mask type expected!");
@@ -1851,6 +1909,20 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
case AArch64::BFMWri:
case AArch64::BFMXri:
return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ if (UserNode->getOperand(0) != Orig)
+ return;
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
+ return;
+
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ if (UserNode->getOperand(0) != Orig)
+ return;
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
+ return;
}
}
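A sketch of why the new truncating-store cases help (invented example): the store consumes only the low bits, so the bitfield matching below may treat the rest as don't-care.

#include <cstdint>

// The byte store only uses bits [7:0] of the OR result, so getUsefulBits
// narrows UsefulBits to 0xff for the value feeding the STRBBui/STURBBi.
void store_low_byte(uint8_t *p, uint32_t x, uint32_t y) {
  *p = static_cast<uint8_t>(x | (y << 4));
}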
@@ -1963,36 +2035,129 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
return true;
}
-// Given a OR operation, check if we have the following pattern
-// ubfm c, b, imm, imm2 (or something that does the same jobs, see
-// isBitfieldExtractOp)
-// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
-// countTrailingZeros(mask2) == imm2 - imm + 1
-// f = d | c
-// if yes, given reference arguments will be update so that one can replace
-// the OR instruction with:
-// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
-static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
- SDValue &Src, unsigned &ImmR,
- unsigned &ImmS, const APInt &UsefulBits,
- SelectionDAG *CurDAG) {
+static bool isShiftedMask(uint64_t Mask, EVT VT) {
+ assert(VT == MVT::i32 || VT == MVT::i64);
+ if (VT == MVT::i32)
+ return isShiftedMask_32(Mask);
+ return isShiftedMask_64(Mask);
+}
+
+// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
+// inserted only sets known zero bits.
+static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
  assert(N->getOpcode() == ISD::OR && "Expect an OR operation");
- // Set Opc
EVT VT = N->getValueType(0);
- if (VT == MVT::i32)
- Opc = AArch64::BFMWri;
- else if (VT == MVT::i64)
- Opc = AArch64::BFMXri;
- else
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ unsigned BitWidth = VT.getSizeInBits();
+
+ uint64_t OrImm;
+ if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
+ return false;
+
+  // Skip this transformation if the OR immediate can already be encoded as a
+  // logical immediate in a plain ORR; in that case we would only trade an
+  // AND+ORR for an ORR+BFI/BFXIL, which is most likely performance neutral.
+ if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
return false;
+ uint64_t MaskImm;
+ SDValue And = N->getOperand(0);
+ // Must be a single use AND with an immediate operand.
+ if (!And.hasOneUse() ||
+ !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
+ return false;
+
+ // Compute the Known Zero for the AND as this allows us to catch more general
+ // cases than just looking for AND with imm.
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+
+ // Non-zero in the sense that they're not provably zero, which is the key
+ // point if we want to use this value.
+ uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+
+  // The KnownZero mask must be a shifted mask, i.e. a single contiguous run
+  // of ones (e.g., 11100..00 or 00011..10..0).
+ if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+ return false;
+
+ // The bits being inserted must only set those bits that are known to be zero.
+ if ((OrImm & NotKnownZero) != 0) {
+ // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
+ // currently handle this case.
+ return false;
+ }
+
+ // BFI/BFXIL dst, src, #lsb, #width.
+ int LSB = countTrailingOnes(NotKnownZero);
+ int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
+
+ // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
+ unsigned ImmR = (BitWidth - LSB) % BitWidth;
+ unsigned ImmS = Width - 1;
+
+  // If we're creating a BFI instruction, avoid cases where we need more
+ // instructions to materialize the BFI constant as compared to the original
+ // ORR. A BFXIL will use the same constant as the original ORR, so the code
+ // should be no worse in this case.
+ bool IsBFI = LSB != 0;
+ uint64_t BFIImm = OrImm >> LSB;
+ if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
+    // We have a BFI instruction and we know the constant can't be materialized
+    // with an ORR-immediate using the zero register.
+ unsigned OrChunks = 0, BFIChunks = 0;
+ for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
+ if (((OrImm >> Shift) & 0xFFFF) != 0)
+ ++OrChunks;
+ if (((BFIImm >> Shift) & 0xFFFF) != 0)
+ ++BFIChunks;
+ }
+ if (BFIChunks > OrChunks)
+ return false;
+ }
+
+ // Materialize the constant to be inserted.
+ SDLoc DL(N);
+ unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+ SDNode *MOVI = CurDAG->getMachineNode(
+ MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
+
+ // Create the BFI/BFXIL instruction.
+ SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
+ CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+}
+
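An invented source sketch of the or-with-immediate case above; read the expected instructions as a plausible lowering rather than a guarantee:

#include <cstdint>

// 0x5a is not a valid logical immediate, so instead of AND + MOV + ORR
// this can become:
//   mov   w8, #0x5a
//   bfxil w0, w8, #0, #8      // LSB = 0, Width = 8
uint32_t insert_low_byte(uint32_t x) {
  return (x & 0xFFFFFF00u) | 0x5Au;
}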
+static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
+ SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect an OR operation");
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ unsigned BitWidth = VT.getSizeInBits();
+
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+  // Given an OR operation, check if we have the following pattern:
+  //   ubfm c, b, imm, imm2 (or something that does the same job, see
+  //   isBitfieldExtractOp)
+  //   d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
+  //   countTrailingZeros(mask2) == imm2 - imm + 1
+  //   f = d | c
+  // If so, replace the OR instruction with:
+  //   f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
+
// OR is commutative, check all combinations of operand order and values of
// BiggerPattern, i.e.
// Opd0, Opd1, BiggerPattern=false
@@ -2004,8 +2169,11 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// and/or inserting fewer extra instructions.
for (int I = 0; I < 4; ++I) {
+ SDValue Dst, Src;
+ unsigned ImmR, ImmS;
bool BiggerPattern = I / 2;
- SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+ SDValue OrOpd0Val = N->getOperand(I % 2);
+ SDNode *OrOpd0 = OrOpd0Val.getNode();
SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
SDNode *OrOpd1 = OrOpd1Val.getNode();
@@ -2030,10 +2198,10 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
- } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+ } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
BiggerPattern,
Src, DstLSB, Width)) {
- ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+ ImmR = (BitWidth - DstLSB) % BitWidth;
ImmS = Width - 1;
} else
continue;
@@ -2069,60 +2237,98 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
Dst = OrOpd1Val;
// both parts match
+ SDLoc DL(N);
+ SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+ }
+
+ // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
+ // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
+ // mask (e.g., 0x000ffff0).
+ uint64_t Mask0Imm, Mask1Imm;
+ SDValue And0 = N->getOperand(0);
+ SDValue And1 = N->getOperand(1);
+ if (And0.hasOneUse() && And1.hasOneUse() &&
+ isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
+ isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
+ APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
+ (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
+
+ // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
+    // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask selecting the
+    // bits to be inserted.
+ if (isShiftedMask(Mask0Imm, VT)) {
+ std::swap(And0, And1);
+ std::swap(Mask0Imm, Mask1Imm);
+ }
+
+ SDValue Src = And1->getOperand(0);
+ SDValue Dst = And0->getOperand(0);
+ unsigned LSB = countTrailingZeros(Mask1Imm);
+ int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
+
+ // The BFXIL inserts the low-order bits from a source register, so right
+ // shift the needed bits into place.
+ SDLoc DL(N);
+ unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ SDNode *LSR = CurDAG->getMachineNode(
+ ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+ CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
+
+ // BFXIL is an alias of BFM, so translate to BFM operands.
+ unsigned ImmR = (BitWidth - LSB) % BitWidth;
+ unsigned ImmS = Width - 1;
+
+ // Create the BFXIL instruction.
+ SDValue Ops[] = {Dst, SDValue(LSR, 0),
+ CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
return false;
}
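And a sketch of the two-AND form handled at the end of tryBitfieldInsertOpFromOr (names and masks invented):

#include <cstdint>

// Mask0Imm == ~Mask1Imm and Mask1Imm (0x0000fff0) is a shifted mask, so
// the OR can lower to:
//   lsr w8, w1, #4            // shift the source field down
//   bfi w0, w8, #4, #12       // ImmR = 28, ImmS = 11 as BFM operands
uint32_t merge_fields(uint32_t x, uint32_t y) {
  return (x & 0xFFFF000Fu) | (y & 0x0000FFF0u);
}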
-SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
if (N->getOpcode() != ISD::OR)
- return nullptr;
+ return false;
- unsigned Opc;
- unsigned LSB, MSB;
- SDValue Opd0, Opd1;
- EVT VT = N->getValueType(0);
APInt NUsefulBits;
getUsefulBits(SDValue(N, 0), NUsefulBits);
  // If none of the bits are useful, just return UNDEF.
- if (!NUsefulBits)
- return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
+ if (!NUsefulBits) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+ return true;
+ }
- if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
- CurDAG))
- return nullptr;
+ if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
+ return true;
- SDLoc dl(N);
- SDValue Ops[] = { Opd0,
- Opd1,
- CurDAG->getTargetConstant(LSB, dl, VT),
- CurDAG->getTargetConstant(MSB, dl, VT) };
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
}
/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
/// equivalent of a left shift by a constant amount followed by an and masking
/// out a contiguous set of bits.
-SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
if (N->getOpcode() != ISD::AND)
- return nullptr;
+ return false;
EVT VT = N->getValueType(0);
- unsigned Opc;
- if (VT == MVT::i32)
- Opc = AArch64::UBFMWri;
- else if (VT == MVT::i64)
- Opc = AArch64::UBFMXri;
- else
- return nullptr;
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
SDValue Op0;
int DstLSB, Width;
if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
Op0, DstLSB, Width))
- return nullptr;
+ return false;
// ImmR is the rotate right amount.
unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
@@ -2132,7 +2338,9 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
SDLoc DL(N);
SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
}
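An illustrative source shape for tryBitfieldInsertInZeroOp (invented name):

#include <cstdint>

// isBitfieldPositioningOp sees DstLSB = 4 and Width = 8, giving UBFMWri
// with ImmR = (32 - 4) % 32 = 28 and ImmS = 7, i.e. ubfiz w0, w0, #4, #8.
uint32_t shift_into_zero(uint32_t x) {
  return (x << 4) & 0xFF0u;
}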
bool
@@ -2214,62 +2422,68 @@ static int getIntOperandFromRegisterString(StringRef RegString) {
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
-SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) {
+bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
- if (Reg != -1)
- return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
- MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(0));
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(0)));
+ return true;
+ }
// Use the sysreg mapper to map the remaining possible strings to the
// value for the register to be used for the instruction operand.
- AArch64SysReg::MRSMapper mapper;
- bool IsValidSpecialReg;
- Reg = mapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
- if (IsValidSpecialReg)
- return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
- MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(0));
+ auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+ if (TheReg && TheReg->Readable &&
+ TheReg->haveFeatures(Subtarget->getFeatureBits()))
+ Reg = TheReg->Encoding;
+ else
+ Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(0)));
+ return true;
+ }
- return nullptr;
+ return false;
}
// Lower the write_register intrinsic to an MSR instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
-SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
+bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
- if (Reg != -1)
- return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+ if (Reg != -1) {
+ ReplaceNode(
+ N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(2), N->getOperand(0));
+ N->getOperand(2), N->getOperand(0)));
+ return true;
+ }
// Check if the register was one of those allowed as the pstatefield value in
// the MSR (immediate) instruction. To accept the values allowed in the
// pstatefield for the MSR (immediate) instruction, we also require that an
  // immediate value has been provided as an argument; we know this is the
  // case, as it has been ensured by semantic checking.
- AArch64PState::PStateMapper PMapper;
- bool IsValidSpecialReg;
- Reg = PMapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
- if (IsValidSpecialReg) {
+  auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
+ if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
+ unsigned Reg = PMapper->Encoding;
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
unsigned State;
if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
@@ -2279,29 +2493,66 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
assert(Immed < 16 && "Bad imm");
State = AArch64::MSRpstateImm4;
}
- return CurDAG->getMachineNode(State, DL, MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- CurDAG->getTargetConstant(Immed, DL, MVT::i16),
- N->getOperand(0));
+ ReplaceNode(N, CurDAG->getMachineNode(
+ State, DL, MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+ N->getOperand(0)));
+ return true;
}
// Use the sysreg mapper to attempt to map the remaining possible strings
// to the value for the register to be used for the MSR (register)
// instruction operand.
- AArch64SysReg::MSRMapper Mapper;
- Reg = Mapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
+ auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+ if (TheReg && TheReg->Writeable &&
+ TheReg->haveFeatures(Subtarget->getFeatureBits()))
+ Reg = TheReg->Encoding;
+ else
+ Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MSR, DL, MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(2), N->getOperand(0)));
+ return true;
+ }
- if (IsValidSpecialReg)
- return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(2), N->getOperand(0));
+ return false;
+}
+
+/// We've got special pseudo-instructions for these operations.
+void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+ unsigned Opcode;
+ EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+ if (MemTy == MVT::i8)
+ Opcode = AArch64::CMP_SWAP_8;
+ else if (MemTy == MVT::i16)
+ Opcode = AArch64::CMP_SWAP_16;
+ else if (MemTy == MVT::i32)
+ Opcode = AArch64::CMP_SWAP_32;
+ else if (MemTy == MVT::i64)
+ Opcode = AArch64::CMP_SWAP_64;
+ else
+ llvm_unreachable("Unknown AtomicCmpSwap type");
- return nullptr;
+ MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
+ SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ N->getOperand(0)};
+ SDNode *CmpSwap = CurDAG->getMachineNode(
+ Opcode, SDLoc(N),
+ CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+ CurDAG->RemoveDeadNode(N);
}
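A minimal sketch of source that can reach the new ATOMIC_CMP_SWAP path (invented name; assumes a configuration where the cmpxchg is not expanded earlier):

#include <atomic>
#include <cstdint>

// Selected to the CMP_SWAP_32 pseudo, which is later expanded into an
// exclusive load/store (LDAXR/STLXR) retry loop.
bool cas(std::atomic<uint32_t> &V, uint32_t &Expected, uint32_t Desired) {
  return V.compare_exchange_strong(Expected, Desired);
}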
-SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+void AArch64DAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(errs() << "Selecting: ");
DEBUG(Node->dump(CurDAG));
@@ -2311,54 +2562,61 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
  // A few custom selection cases.
- SDNode *ResNode = nullptr;
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
default:
break;
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(Node);
+ return;
+
case ISD::READ_REGISTER:
- if (SDNode *Res = SelectReadRegister(Node))
- return Res;
+ if (tryReadRegister(Node))
+ return;
break;
case ISD::WRITE_REGISTER:
- if (SDNode *Res = SelectWriteRegister(Node))
- return Res;
+ if (tryWriteRegister(Node))
+ return;
break;
case ISD::ADD:
- if (SDNode *I = SelectMLAV64LaneV128(Node))
- return I;
+ if (tryMLAV64LaneV128(Node))
+ return;
break;
case ISD::LOAD: {
// Try to select as an indexed load. Fall through to normal processing
// if we can't.
- bool Done = false;
- SDNode *I = SelectIndexedLoad(Node, Done);
- if (Done)
- return I;
+ if (tryIndexedLoad(Node))
+ return;
break;
}
case ISD::SRL:
case ISD::AND:
case ISD::SRA:
- if (SDNode *I = SelectBitfieldExtractOp(Node))
- return I;
- if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
- return I;
+ case ISD::SIGN_EXTEND_INREG:
+ if (tryBitfieldExtractOp(Node))
+ return;
+ if (tryBitfieldInsertInZeroOp(Node))
+ return;
+ break;
+
+ case ISD::SIGN_EXTEND:
+ if (tryBitfieldExtractOpFromSExt(Node))
+ return;
break;
case ISD::OR:
- if (SDNode *I = SelectBitfieldInsertOp(Node))
- return I;
+ if (tryBitfieldInsertOp(Node))
+ return;
break;
case ISD::EXTRACT_VECTOR_ELT: {
@@ -2401,19 +2659,25 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
DEBUG(Extract->dumpr(CurDAG));
DEBUG(dbgs() << "\n");
- return Extract.getNode();
+ ReplaceNode(Node, Extract.getNode());
+ return;
}
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
if (ConstNode->isNullValue()) {
- if (VT == MVT::i32)
- return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
- AArch64::WZR, MVT::i32).getNode();
- else if (VT == MVT::i64)
- return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
- AArch64::XZR, MVT::i64).getNode();
+ if (VT == MVT::i32) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
+ ReplaceNode(Node, New.getNode());
+ return;
+ } else if (VT == MVT::i64) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
+ ReplaceNode(Node, New.getNode());
+ return;
+ }
}
break;
}
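Materializing zero as a copy from the zero register, WZR (32-bit) or XZR (64-bit), rather than as a mov-immediate lets the register coalescer substitute the zero register directly into users of the constant. A small sketch of the width-based register choice, with the assembly shown only in illustrative comments:

  #include <cstdio>

  // WZR for 32-bit zeros, XZR for 64-bit, matching the VT check above.
  static const char *zeroRegFor(unsigned Bits) {
    return Bits == 64 ? "xzr" : "wzr";
  }

  int main() {
    // After coalescing, a user such as "add w1, w0, w2" where w0 == 0
    // can read the zero register directly: "add w1, wzr, w2".
    std::printf("32-bit zero -> %s\n64-bit zero -> %s\n",
                zeroRegFor(32), zeroRegFor(64));
    return 0;
  }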
@@ -2428,7 +2692,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
SDLoc DL(Node);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
- return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
@@ -2450,7 +2715,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
- return Ld;
+ ReplaceNode(Node, Ld);
+ return;
}
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp: {
@@ -2471,208 +2737,305 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(Node, St);
+ return;
}
case Intrinsic::aarch64_neon_ld1x2:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld1x3:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld1x4:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 2, AArch64::LD2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 2, AArch64::LD2i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 2, AArch64::LD2i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 2, AArch64::LD2i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 2, AArch64::LD2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 2, AArch64::LD2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 2, AArch64::LD2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 2, AArch64::LD2i64);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 3, AArch64::LD3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 3, AArch64::LD3i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 3, AArch64::LD3i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 3, AArch64::LD3i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 3, AArch64::LD3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 3, AArch64::LD3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 3, AArch64::LD3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 3, AArch64::LD3i64);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 4, AArch64::LD4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 4, AArch64::LD4i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 4, AArch64::LD4i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 4, AArch64::LD4i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 4, AArch64::LD4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 4, AArch64::LD4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 4, AArch64::LD4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 4, AArch64::LD4i64);
+ return;
+ }
break;
}
} break;
@@ -2682,33 +3045,39 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
default:
break;
case Intrinsic::aarch64_neon_tbl2:
- return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
- : AArch64::TBLv16i8Two,
- false);
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbl3:
- return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
- : AArch64::TBLv16i8Three,
- false);
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+ : AArch64::TBLv16i8Three,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbl4:
- return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
- : AArch64::TBLv16i8Four,
- false);
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+ : AArch64::TBLv16i8Four,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbx2:
- return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
- : AArch64::TBXv16i8Two,
- true);
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
+ true);
+ return;
case Intrinsic::aarch64_neon_tbx3:
- return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
- : AArch64::TBXv16i8Three,
- true);
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+ : AArch64::TBXv16i8Three,
+ true);
+ return;
case Intrinsic::aarch64_neon_tbx4:
- return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
- : AArch64::TBXv16i8Four,
- true);
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+ : AArch64::TBXv16i8Four,
+ true);
+ return;
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
- if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
- return N;
+ if (tryMULLV64LaneV128(IntNo, Node))
+ return;
break;
}
break;
@@ -2721,588 +3090,827 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
default:
break;
case Intrinsic::aarch64_neon_st1x2: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 2, AArch64::ST1Twov8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 2, AArch64::ST1Twov16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 2, AArch64::ST1Twov4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 2, AArch64::ST1Twov8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 2, AArch64::ST1Twov2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 2, AArch64::ST1Twov4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 2, AArch64::ST1Twov2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st1x3: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 3, AArch64::ST1Threev8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 3, AArch64::ST1Threev16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 3, AArch64::ST1Threev4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 3, AArch64::ST1Threev8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 3, AArch64::ST1Threev2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 3, AArch64::ST1Threev4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 3, AArch64::ST1Threev2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st1x4: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 4, AArch64::ST1Fourv8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 4, AArch64::ST1Fourv16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 4, AArch64::ST1Fourv4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 4, AArch64::ST1Fourv8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 4, AArch64::ST1Fourv2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 4, AArch64::ST1Fourv4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st2: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 2, AArch64::ST2Twov8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 2, AArch64::ST2Twov16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 2, AArch64::ST2Twov4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 2, AArch64::ST2Twov8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 2, AArch64::ST2Twov2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 2, AArch64::ST2Twov4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 2, AArch64::ST2Twov2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST2Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st3: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 3, AArch64::ST3Threev8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 3, AArch64::ST3Threev16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 3, AArch64::ST3Threev4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 3, AArch64::ST3Threev8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 3, AArch64::ST3Threev2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 3, AArch64::ST3Threev4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 3, AArch64::ST3Threev2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST3Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st4: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 4, AArch64::ST4Fourv8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 4, AArch64::ST4Fourv16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 4, AArch64::ST4Fourv4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 4, AArch64::ST4Fourv8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 4, AArch64::ST4Fourv2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 4, AArch64::ST4Fourv4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 4, AArch64::ST4Fourv2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st2lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 2, AArch64::ST2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 2, AArch64::ST2i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 2, AArch64::ST2i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 2, AArch64::ST2i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 2, AArch64::ST2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 2, AArch64::ST2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 2, AArch64::ST2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 2, AArch64::ST2i64);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st3lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 3, AArch64::ST3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 3, AArch64::ST3i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 3, AArch64::ST3i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 3, AArch64::ST3i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 3, AArch64::ST3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 3, AArch64::ST3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 3, AArch64::ST3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 3, AArch64::ST3i64);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st4lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 4, AArch64::ST4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 4, AArch64::ST4i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 4, AArch64::ST4i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 4, AArch64::ST4i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 4, AArch64::ST4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 4, AArch64::ST4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 4, AArch64::ST4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 4, AArch64::ST4i64);
+ return;
+ }
break;
}
}
break;
}
case AArch64ISD::LD2post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD3post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD4post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x2post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x3post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x4post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD2DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD3DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD4DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD2LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD3LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD4LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST2post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST3post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST4post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x2post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x3post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x4post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST2LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST3LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST4LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ return;
+ }
break;
}
}
// Select the default instruction
- ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
-
- return ResNode;
+ SelectCode(Node);
}
/// createAArch64ISelDag - This pass converts a legalized DAG into a
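The hunk above tracks the file's move to the void-returning SelectionDAGISel::Select interface: each SelectPost* helper now replaces the node in place, so every matched case calls its helper and then bare-returns, and the debug dump of the old SDNode* result disappears along with the return value. A minimal standalone sketch of that control-flow change, with illustrative names (Node and selectHelper are stand-ins, not LLVM API):

#include <cstdio>

// Old shape: 'return selectHelper(n);' handed back the replacement node.
// New shape: the helper mutates the DAG in place; the caller bare-returns.
struct Node { int id; };

static void selectHelper(Node *n) {          // stands in for SelectPostLoad &c.
  std::printf("replaced node %d in place\n", n->id);
}

static void select(Node *n, bool matched) {  // stands in for Select(SDNode*)
  if (matched) {
    selectHelper(n);                         // was: return selectHelper(n);
    return;
  }
  std::printf("falling through to table-generated selection\n");
}

int main() {
  Node a{1};
  select(&a, true);
  select(&a, false);
  return 0;
}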
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 92cf1cd71970b..d6f2a190d4c85 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,12 +40,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-// Place holder until extr generation is tested fully.
-static cl::opt<bool>
-EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
- cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
- cl::init(true));
-
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
@@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+// Disabled because it caused self-hosting failures once returned-attribute
+// inference was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden,
+ cl::desc("Directly forward this return"),
+ cl::init(false));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
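The new flag above follows the usual LLVM pattern for gating a transform behind a hidden, default-off command-line option, so it can be re-enabled for experiments without a rebuild. A sketch of the pattern, assuming the real llvm::cl API and a program linked against LLVMSupport (the flag name below is illustrative):

#include "llvm/Support/CommandLine.h"

// cl::Hidden keeps the flag out of -help; cl::init(false) leaves the
// transform off unless e.g. -example-risky-transform is passed to the tool.
static llvm::cl::opt<bool> EnableRiskyTransform(
    "example-risky-transform", llvm::cl::Hidden,
    llvm::cl::desc("Enable the risky transform"), llvm::cl::init(false));

static bool shouldRunTransform() { return EnableRiskyTransform; }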
@@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
- // counterparts, which AArch64 supports directly.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
@@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Also, try to fold ADD into CSINC/CSINV..
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
-
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
@@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
+ // Set required alignment.
setMinFunctionAlignment(2);
+ // Set preferred alignments.
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+ setPrefLoopAlignment(STI.getPrefLoopAlignment());
setHasExtractBitsInsn(true);
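A note on the numbers above: in this era the TargetLowering alignment setters take log2 values, so setMinFunctionAlignment(2) requests 4-byte alignment (the AArch64 instruction size) and the subtarget pref* hooks return log2 amounts as well. That convention is an assumption about the pre-Align-type API, sketched here:

#include <cassert>

// Alignment expressed as log2(bytes): an argument of 2 means 1 << 2 == 4.
static unsigned alignmentInBytes(unsigned Log2Align) { return 1u << Log2Align; }

int main() {
  assert(alignmentInBytes(2) == 4);   // setMinFunctionAlignment(2)
  assert(alignmentInBytes(4) == 16);  // e.g. a 16-byte preferred loop alignment
  return 0;
}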
@@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
+
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
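Marking the vector CTTZ nodes Expand above hands them to the legalizer, which can rewrite a trailing-zero count in terms of operations the target does have. One standard identity the expander can use is cttz(x) = popcount(~x & (x - 1)), since that mask isolates exactly the trailing zero bits. A runnable scalar check of the identity (using GCC/Clang builtins):

#include <cassert>
#include <cstdint>

// ~x & (x - 1) keeps precisely the zero bits below the lowest set bit of x,
// so counting its set bits counts the trailing zeros of x.
static int cttzViaPopcount(uint32_t x) {
  return __builtin_popcount(~x & (x - 1));
}

int main() {
  for (uint32_t x = 1; x < (1u << 16); ++x)
    assert(cttzViaPopcount(x) == __builtin_ctz(x));
  return 0;
}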
@@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // Prefer likely predicted branches to selects on out-of-order cores.
- if (Subtarget->isCortexA57())
- PredictableSelectIsExpensive = true;
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
-void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
} else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
- setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
- }
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
- setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
- setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
- if (!VT.isFloatingPoint() &&
- VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+ if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
ISD::FMINNUM, ISD::FMAXNUM})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
- setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
}
}
}
@@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (Subtarget->requiresStrictAlign())
return false;
- // FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
- // FIXME: Define an attribute for slow unaligned accesses instead of
- // relying on the CPU type as a proxy.
- // On Cyclone, unaligned 128-bit stores are slow.
- *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
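The Fast computation above swaps a CPU-name test for a named subtarget property, so the lowering asks for the behaviour it actually cares about and future cores opt in with a feature bit rather than lengthening the condition. A standalone sketch of that shape (struct and field names are illustrative):

#include <cassert>

struct SubtargetSketch {
  bool Misaligned128StoreSlow = false;  // set per-CPU, queried by name
};

// Mirrors the predicate in the hunk: misaligned accesses are fast unless the
// core is slow specifically on 128-bit stores.
static bool fastMisaligned(const SubtargetSketch &ST, unsigned StoreSizeBytes) {
  return !ST.Misaligned128StoreSlow || StoreSizeBytes != 16;
}

int main() {
  SubtargetSketch generic, cycloneLike;
  cycloneLike.Misaligned128StoreSlow = true;
  assert(fastMisaligned(generic, 16));
  assert(!fastMisaligned(cycloneLike, 16));
  assert(fastMisaligned(cycloneLike, 8));
  return 0;
}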
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
+ case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+ case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
}
return nullptr;
}
MachineBasicBlock *
-AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
@@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned IfTrueReg = MI->getOperand(1).getReg();
- unsigned IfFalseReg = MI->getOperand(2).getReg();
- unsigned CondCode = MI->getOperand(3).getImm();
- bool NZCVKilled = MI->getOperand(4).isKill();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned IfTrueReg = MI.getOperand(1).getReg();
+ unsigned IfFalseReg = MI.getOperand(2).getReg();
+ unsigned CondCode = MI.getOperand(3).getImm();
+ bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
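EmitF128CSEL builds the classic select diamond: the original block conditionally branches around TrueBB, and both edges meet in EndBB where a phi picks IfTrueReg or IfFalseReg. The same shape in plain C++, with the local variable standing in for the phi destination (function name illustrative):

#include <cassert>

static double f128cselSketch(bool cond, double ifTrue, double ifFalse) {
  double result = ifFalse;  // fall-through edge carries IfFalseReg
  if (cond)
    result = ifTrue;        // the branch through TrueBB carries IfTrueReg
  return result;            // merge point: the phi's destination register
}

int main() {
  assert(f128cselSketch(true, 1.0, 2.0) == 1.0);
  assert(f128cselSketch(false, 1.0, 2.0) == 2.0);
  return 0;
}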
@@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
.addReg(IfFalseReg)
.addMBB(MBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return EndBB;
}
-MachineBasicBlock *
-AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
- MI->dump();
+ MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
@@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC,
}
}
+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ assert(CondCode2 == AArch64CC::AL);
+ break;
+ case ISD::SETONE:
+ // (a one b)
+ // == ((a olt b) || (a ogt b))
+ // == ((a ord b) && (a une b))
+ CondCode = AArch64CC::VC;
+ CondCode2 = AArch64CC::NE;
+ break;
+ case ISD::SETUEQ:
+ // (a ueq b)
+ // == ((a uno b) || (a oeq b))
+ // == ((a ule b) && (a uge b))
+ CondCode = AArch64CC::PL;
+ CondCode2 = AArch64CC::LE;
+ break;
+ }
+}
+
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
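The two special cases in changeFPCCToANDAArch64CC rest on the identities in their comments: ONE is the ordered-and-unequal case and UEQ the unordered-or-equal case, each rewritten as a conjunction of two directly testable conditions. A runnable check of both identities, including the NaN rows that make them non-obvious:

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  const double vals[] = {-1.0, 0.0, 1.0, nan};
  for (double a : vals)
    for (double b : vals) {
      bool one = a < b || a > b;                    // ordered and unequal
      bool ord = !std::isunordered(a, b);
      bool une = !(a == b);                         // unordered or unequal
      assert(one == (ord && une));                  // (a one b) == ord && une

      bool ueq = std::isunordered(a, b) || a == b;  // unordered or equal
      bool ule = std::isunordered(a, b) || a <= b;
      bool uge = std::isunordered(a, b) || a >= b;
      assert(ueq == (ule && uge));                  // (a ueq b) == ule && uge
    }
  return 0;
}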
@@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) {
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
- if (VT.isFloatingPoint())
+ if (VT.isFloatingPoint()) {
+ assert(VT != MVT::f128);
+ if (VT == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ VT = MVT::f32;
+ }
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+ }
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
@@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
- SDValue Condition, unsigned NZCV,
- SDLoc DL, SelectionDAG &DAG) {
+ AArch64CC::CondCode Predicate,
+ AArch64CC::CondCode OutCC,
+ const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
- if (LHS.getValueType().isFloatingPoint())
+ if (LHS.getValueType().isFloatingPoint()) {
+ assert(LHS.getValueType() != MVT::f128);
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+ }
Opcode = AArch64ISD::FCCMP;
- else if (RHS.getOpcode() == ISD::SUB) {
+ } else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- // See emitComparison() on why we can only do this for SETEQ and SETNE.
- Opcode = AArch64ISD::CCMN;
- RHS = RHS.getOperand(1);
- }
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
+ SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
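With this refactor the helper derives the NZCV immediate itself: a conditional compare performs its comparison only when Predicate holds, and otherwise writes flags chosen to satisfy the inverted OutCC, so a failed predicate forces the chain's verdict. In boolean terms one link of an AND chain behaves as below (a deliberately simplified model, not the flags machinery itself):

#include <cassert>
#include <initializer_list>

// CMP a; CCMP b predicated on a: if the predicate failed, inject "false".
static bool ccmpLink(bool predicateHeld, bool thisCompare) {
  return predicateHeld ? thisCompare : false;
}

int main() {
  for (bool a : {false, true})
    for (bool b : {false, true})
      assert((a && b) == ccmpLink(a, b));  // the chain computes the AND
  return 0;
}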
@@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
- CanPushNegate = true;
+ if (Val->getOperand(0).getValueType() == MVT::f128)
+ return false;
+ CanNegate = true;
return true;
}
- // Protect against stack overflow.
- if (Depth > 15)
+ // Protect against exponential runtime and stack overflow.
+ if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ bool CanNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
return false;
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ bool CanNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
return false;
- // We cannot push a negate through an AND operation (it would become an OR),
- // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
- // push the negate through the x/y subtrees.
- CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+
+ if (Opcode == ISD::OR) {
+ // For an OR expression we need to be able to negate at least one side or
+ // we cannot do the transformation at all.
+ if (!CanNegateL && !CanNegateR)
+ return false;
+ // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+ // can negate the x and y subtrees.
+ CanNegate = CanNegateL && CanNegateR;
+ } else {
+      // If the operands are OR expressions then we finally need to negate their
+      // outputs; we can only do that for the operand emitted last, by negating
+      // OutCC, not for both operands.
+ bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return false;
+      // We cannot negate an AND operation (it would become an OR).
+ CanNegate = false;
+ }
return true;
}
return false;
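The eligibility rules above are De Morgan's laws applied to the condition tree: an OR can become the AND chain the conditional-compare hardware wants only when both of its operands can be negated, while a negation can never be pushed through an AND without turning it into an OR. An exhaustive check of the two identities involved:

#include <cassert>
#include <initializer_list>

int main() {
  for (bool x : {false, true})
    for (bool y : {false, true}) {
      // not(x | y) == not(x) & not(y): why CanNegate for OR needs both sides.
      assert(!(x || y) == (!x && !y));
      // not(x & y) == not(x) | not(y): why an AND cannot absorb a negation.
      assert(!(x && y) == (!x || !y));
    }
  return 0;
}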
@@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
- AArch64CC::CondCode &OutCC, bool PushNegate = false,
- SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
- unsigned Depth = 0) {
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+ AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
@@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
- if (PushNegate)
+ if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
@@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
- changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
- // Surpisingly some floating point conditions can't be tested with a
- // single condition code. Construct an additional comparison in this case.
- // See comment below on how we deal with OR conditions.
+ changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+ // Some floating point conditions can't be tested with a single condition
+ // code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
- else {
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- // Note that we want the inverse of ExtraCC, so NZCV is not inversed.
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
- ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
- NZCV, DL, DAG);
- }
+ else
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+ ExtraCC, DL, DAG);
CCOp = ExtraCmp;
- Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
- OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
- if (!CCOp.getNode())
+ if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
- return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
- } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
- return SDValue();
-
- assert((Opcode == ISD::OR || !PushNegate)
- && "Can only push negate through OR operation");
+ }
+ assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+ "Valid conjunction/disjunction tree");
// Check if both sides can be transformed.
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
- return SDValue();
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
- return SDValue();
- // Do we need to negate our operands?
- bool NegateOperands = Opcode == ISD::OR;
+ // In case of an OR we need to negate our operands and the result.
+ // (A v B) <=> not(not(A) ^ not(B))
+ bool NegateOpsAndResult = Opcode == ISD::OR;
// We can negate the results of all previous operations by inverting the
- // predicate flags giving us a free negation for one side. For the other side
- // we need to be able to push the negation to the leafs of the tree.
- if (NegateOperands) {
- if (!CanPushNegateL && !CanPushNegateR)
- return SDValue();
- // Order the side where we can push the negate through to LHS.
- if (!CanPushNegateL && CanPushNegateR)
+ // predicate flags giving us a free negation for one side. The other side
+ // must be negatable by itself.
+ if (NegateOpsAndResult) {
+ // See which side we can negate.
+ bool CanNegateL;
+ bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+ assert(isValidL && "Valid conjunction/disjunction tree");
+ (void)isValidL;
+
+#ifndef NDEBUG
+ bool CanNegateR;
+ bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+ assert(isValidR && "Valid conjunction/disjunction tree");
+ assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+ // Order the side which we cannot negate to RHS so we can emit it first.
+ if (!CanNegateL)
std::swap(LHS, RHS);
} else {
bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
- bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
- if (NeedsNegOutL && NeedsNegOutR)
- return SDValue();
+ assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+ "Valid conjunction/disjunction tree");
// Order the side where we need to negate the output flags to RHS so it
// gets emitted first.
if (NeedsNegOutL)
@@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
// through if we are already in a PushNegate case, otherwise we can negate
// the "flags to test" afterwards.
AArch64CC::CondCode RHSCC;
- SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
- CCOp, Predicate, Depth+1);
- if (NegateOperands && !PushNegate)
+ SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+ CCOp, Predicate);
+ if (NegateOpsAndResult && !Negate)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
- // Emit LHS. We must push the negate through if we need to negate it.
- SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
- CmpR, RHSCC, Depth+1);
+ // Emit LHS. We may need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+ NegateOpsAndResult, CmpR,
+ RHSCC);
  // If we transformed an OR into an AND then we have to negate the result
- // (or absorb a PushNegate resulting in a double negation).
- if (Opcode == ISD::OR && !PushNegate)
+ // (or absorb the Negate parameter).
+ if (NegateOpsAndResult && !Negate)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC) {
+ bool CanNegate;
+ if (!isConjunctionDisjunctionTree(Val, CanNegate))
+ return SDValue();
+
+ return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
+ AArch64CC::AL);
+}
+
/// @}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ SDValue &AArch64cc, SelectionDAG &DAG,
+ const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::getVectorVT(TruncVT, NumElts), Ops);
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
@@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::aarch64_thread_pointer: {
+ case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
@@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::PreserveMost:
+ case CallingConv::CXX_FAST_TLS:
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
@@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- MemVT, false, false, false, 0);
+ MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin()) {
// The AAPCS variadic function ABI is identical to the non-variadic
@@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
- AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
- StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+ StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
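The rounding above is the usual power-of-two align-up trick, (x + A - 1) & ~(A - 1), which is what alignTo computes for the power-of-two alignments used here (llvm::alignTo also handles other alignments; this sketch covers only the power-of-two case):

#include <cassert>
#include <cstdint>

static uint64_t alignUpPow2(uint64_t x, uint64_t a) {  // a: power of two
  return (x + a - 1) & ~(a - 1);
}

int main() {
  for (uint64_t x = 0; x < 64; ++x) {
    assert(alignUpPow2(x, 8) == ((x + 7) & ~uint64_t(7)));  // varargs rounding
    assert(alignUpPow2(x, 16) % 16 == 0);                   // popped-stack case
    assert(alignUpPow2(x, 16) >= x && alignUpPow2(x, 16) - x < 16);
  }
  return 0;
}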
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
- SelectionDAG &DAG, SDLoc DL,
+ SelectionDAG &DAG,
+ const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
- false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
@@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn) {
+ if (i == 0 && isThisReturn && EnableThisRetForwarding) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult(
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
@@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
@@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- if (IsTailCallConvention(CalleeCC) && CCMatch)
- return true;
- return false;
+ return IsTailCallConvention(CalleeCC) && CCMatch;
}
// Externally-defined functions with weak linkage should not be
@@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
+ LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
@@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
@@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForCall(CalleeCC, isVarArg),
+ CCAssignFnForCall(CallerCC, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
-
- SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
-
- if (RVLocs1.size() != RVLocs2.size())
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
// Nothing more to check if the callee is taking no arguments
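The preserved-register test above replaces the old per-location result comparison: the tail call is safe only if every register the caller has to preserve is also preserved across the callee, i.e. the caller's preserved set is a subset of the callee's. With register masks modelled as plain bitmasks, that check is one expression:

#include <cassert>
#include <cstdint>

// regmaskSubsetEqual in miniature: caller-preserved must be a subset of
// callee-preserved, so no bit may survive outside the callee's mask.
static bool calleePreservesEnough(uint64_t callerPreserved,
                                  uint64_t calleePreserved) {
  return (callerPreserved & ~calleePreserved) == 0;
}

int main() {
  const uint64_t caller = 0b1011;                  // expects r0, r1, r3 kept
  assert(calleePreservesEnough(caller, 0b1111));   // callee keeps more: ok
  assert(!calleePreservesEnough(caller, 0b0011));  // callee clobbers r3: no
  return 0;
}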
@@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- // If the stack arguments for this call would fit into our own save area then
- // the call can be made tail.
- return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+
+ return true;
}
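
As a sketch of the stack-argument check above (an illustration, not part of this patch): under AAPCS64 the first eight integer arguments travel in x0-x7, so a call with a ninth integer argument needs caller-provided stack space, and a caller whose own incoming stack-argument area is empty cannot supply it in a tail call. The names below are hypothetical.

// Hypothetical: the caller's BytesInStackArgArea is 0, but the call site
// needs stack space for the ninth argument, so getNextStackOffset() > 0
// and the check above rejects the tail call.
int callee(int a, int b, int c, int d, int e, int f, int g, int h, int i);
int caller(int a, int b) {
  return callee(a, b, 2, 3, 4, 5, 6, 7, 8); // ninth argument is stacked
}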
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
@@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
}
bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
- return CallCC == CallingConv::Fast;
+ return CallCC == CallingConv::Fast ||
+ CallCC == CallingConv::PreserveMost;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
@@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
- Callee, CallConv, IsVarArg, IsStructRet,
- MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
- NumBytes = RoundUpToAlignment(NumBytes, 16);
+ NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
@@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
- SDValue Store =
- DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
@@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
- uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
- ? RoundUpToAlignment(NumBytes, 16)
- : 0;
+ uint64_t CalleePopBytes =
+ DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
@@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
@@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
- if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
- "use of MO_CONSTPOOL only supported on small model");
- SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
- SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
- SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
- SDValue GlobalAddr = DAG.getLoad(
- PtrVT, DL, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*isVolatile=*/false,
- /*isNonTemporal=*/true,
- /*isInvariant=*/true, 8);
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
- DAG.getConstant(GN->getOffset(), DL, PtrVT));
- return GlobalAddr;
- }
-
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
@@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet =
DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
- true, true, 8);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 8, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
-SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+ const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(Chain);
- Ops.push_back(SymAddr);
-
- Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+ Chain =
+ DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
@@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
- SDValue FVal, SDLoc dl,
+ SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
@@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
- MachinePointerInfo(SV), false, false, 8));
+ MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
@@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
- MachinePointerInfo(SV, 8), false, false, 8));
+ MachinePointerInfo(SV, 8),
+ /* Alignment = */ 8));
}
// void *__vr_top at offset 16
@@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
- MachinePointerInfo(SV, 16), false, false, 8));
+ MachinePointerInfo(SV, 16),
+ /* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-GPRSize, DL, MVT::i32),
- GROffsAddr, MachinePointerInfo(SV, 24), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
+ MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-FPRSize, DL, MVT::i32),
- VROffsAddr, MachinePointerInfo(SV, 28), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
+ MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
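
For reference, the five stores above fill in the AAPCS64 va_list record, whose layout the ABI fixes as follows (the field comments note the byte offsets used above):

// AAPCS64 va_list, the structure LowerAAPCS_VASTART initializes.
typedef struct {
  void *__stack;  // offset 0: next stacked argument
  void *__gr_top; // offset 8: end of the GP register save area
  void *__vr_top; // offset 16: end of the FP/SIMD register save area
  int __gr_offs;  // offset 24: -(bytes of GP regs still unconsumed)
  int __vr_offs;  // offset 28: -(bytes of FP/SIMD regs still unconsumed)
} va_list;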
@@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
- false, false, false, 0);
+ SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
if (Align > 8) {
@@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
- SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
- false, false, 0);
+ SDValue APStore =
+ DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
- SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue WideFP =
+ DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
@@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, DL);
}
- return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
- false, false, 0);
+ return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
+/// getEstimate - Return a DAG node that computes an estimate of either the
+/// reciprocal or the reciprocal square root.
+static SDValue getEstimate(const AArch64Subtarget &ST,
+ const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
+ const SDValue &Operand, unsigned &ExtraSteps) {
+ if (!ST.hasNEON())
+ return SDValue();
+
+ EVT VT = Operand.getValueType();
+
+ std::string RecipOp;
+ RecipOp = (Opcode == AArch64ISD::FRECPE) ? "div" : "sqrt";
+ RecipOp = (VT.isVector() ? "vec-" : "") + RecipOp;
+ RecipOp += (VT.getScalarType() == MVT::f64) ? "d" : "f";
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ ExtraSteps = Recips.getRefinementSteps(RecipOp);
+ return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
+}
+
+SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
+ UseOneConst = true;
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+}
+
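The hardware estimate is only a seed; ExtraSteps tells the combiner how many Newton-Raphson refinement iterations to wrap around it. A scalar sketch of one such step for the reciprocal square root (generic math, not the exact FRSQRTS semantics):

// One Newton-Raphson step refining an rsqrt estimate x for input d;
// the error roughly squares on each iteration.
float refineRsqrt(float d, float x) {
  return x * (1.5f - 0.5f * d * x * x);
}
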
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
+const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ // At this point, we have to lower this constraint to something else, so we
+ // lower it to an "r" or "w". However, by doing this we will force the result
+ // to be in a register, while the X constraint is much more permissive.
+ //
+ // Although we are correct (we are free to emit anything, without
+ // constraints), we might break use cases that would expect us to be more
+ // efficient and emit something else.
+ if (!Subtarget->hasFPARMv8())
+ return "r";
+
+ if (ConstraintVT.isFloatingPoint())
+ return "w";
+
+ if (ConstraintVT.isVector() &&
+ (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
+ return "w";
+
+ return "r";
+}
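
A hypothetical use of the "X" constraint that this hook now re-lowers (a sketch only; with FP available the operand is treated as if it had been "w", so the %d0 modifier names a SIMD/FP register):

double square(double d) {
  // "X" normally permits any operand; after this change an FP-typed
  // operand is constrained to a "w" register instead.
  asm("fmul %d0, %d0, %d0" : "+X"(d));
  return d;
}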
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
@@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
- // v0 - v31 are aliases of q0 - q31.
+ // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
- Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
- Res.second = &AArch64::FPR128RegClass;
+ if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+ Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR64RegClass;
+ } else {
+ Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR128RegClass;
+ }
}
}
}
@@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
- else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(V.getOperand(1))) {
// A shuffle can only come from building a vector from various
- // elements of other vectors.
+ // elements of other vectors, provided their indices are constant.
return SDValue();
}
@@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF)
+ if (Entry.isUndef())
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
@@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], &Mask[0]);
+ ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
@@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
- if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+ if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
- // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- // &TBLMask[0], IndexLen));
+ // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
+ // IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
- DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
- V1Cst, V2Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
+ V2Cst, DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
@@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
- V1.getValueType().getSimpleVT())) {
+ if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
@@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
- } else if (V2->getOpcode() == ISD::UNDEF &&
- isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
@@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
- SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
- if (Concat.getNode())
+ if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
@@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (EnableAArch64SlrGeneration) {
- SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
- if (Res.getNode())
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
}
@@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op,
}
Ops.push_back(Lane);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
@@ -6217,7 +6311,7 @@ FailedModImm:
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -6273,7 +6367,7 @@ FailedModImm:
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -6328,7 +6422,7 @@ FailedModImm:
// value is already in an S or D register.
// Do not do this for UNDEF/LOAD nodes because we have better patterns
// for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD &&
+ if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
(ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
@@ -6339,7 +6433,7 @@ FailedModImm:
}
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
@@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
@@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
const DataLayout &DL = I->getModule()->getDataLayout();
EVT VT = getValueType(DL, User->getOperand(0)->getType());
- if (isFMAFasterThanFMulAndFAdd(VT) &&
- isOperationLegalOrCustom(ISD::FMA, VT) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
- return false;
-
- return true;
+ return !(isFMAFasterThanFMulAndFAdd(VT) &&
+ isOperationLegalOrCustom(ISD::FMA, VT) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
- if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
- return true;
- return false;
+ // Avoid UB for INT64_MIN.
+ if (Immed == std::numeric_limits<int64_t>::min())
+ return false;
+ // Same encoding for add/sub, just flip the sign.
+ Immed = std::abs(Immed);
+ return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
}
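
Concretely, a legal add/sub immediate is a 12-bit value, optionally shifted left by 12. A standalone sketch of the same predicate (the helper name is made up), with example inputs in the comments:

#include <cstdint>
// Mirrors the check above for non-negative values.
bool isAddSubImmediate(uint64_t v) {
  return (v >> 12) == 0 ||                     // plain imm12, e.g. 4095
         ((v & 0xfff) == 0 && (v >> 24) == 0); // imm12 LSL #12, e.g. 0xabc000
}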
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
- if (Immed < 0)
- Immed *= -1;
return isLegalAddImmediate(Immed);
}
@@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
- if (!AM.Scale || AM.Scale == 1 ||
- (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
- return true;
- return false;
+ return !AM.Scale || AM.Scale == 1 ||
+ (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
@@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return Shift < 3;
}
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// cmge X, X, #0
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!Subtarget->hasNEON() || !VT.isVector())
+ return SDValue();
+
+ // There must be an arithmetic shift right before the xor, and the xor must
+ // be a 'not' operation (an xor with all-ones).
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+ if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ return SDValue();
+
+ return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+}
+
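The scalar identity behind this fold, per 32-bit lane: smearing the sign bit and inverting produces an all-ones mask exactly when the element is non-negative, which is what a compare-greater-or-equal-zero computes. A sketch:

#include <cstdint>
// ~(x >> 31) is -1 when x >= 0 and 0 when x < 0, assuming an arithmetic
// shift of the signed value — the same per-lane mask as cmge x, #0.
int32_t geZeroMask(int32_t x) {
  return ~(x >> 31);
}
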
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// performXorCombine - Attempts to handle integer ABS.
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
return performIntegerAbsCombine(N, DAG);
}
@@ -7376,6 +7496,10 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SDIV as SDIV, i.e. keep the division.
+
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
@@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- APInt Value = C->getAPIntValue();
+ const APInt &Value = C->getAPIntValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (Value.isNonNegative()) {
@@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->isVolatile(),
- LN0->isNonTemporal(), LN0->isInvariant(),
- LN0->getAlignment());
+ LN0->getPointerInfo(), LN0->getAlignment(),
+ LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
@@ -7567,7 +7690,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -7801,25 +7925,49 @@ static SDValue tryCombineToBSL(SDNode *N,
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
- if (!EnableAArch64ExtrGeneration)
- return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
- SDValue Res = tryCombineToEXTR(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
- Res = tryCombineToBSL(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
+static SDValue performSRLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
+ // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
+ // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() == ISD::BSWAP) {
+ SDLoc DL(N);
+ SDValue N1 = N->getOperand(1);
+ SDValue N00 = N0.getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+ if (VT == MVT::i32 && ShiftAmt == 16 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ if (VT == MVT::i64 && ShiftAmt == 32 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ }
+ }
+ return SDValue();
+}
+
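The identity only holds because the bits the rotate brings back in are known zero; the rotr form can then presumably be matched to a single REV16/REV32-style instruction. A 32-bit sketch:

#include <cstdint>
// With the top 16 bits of x zero, bswap leaves the low half of the
// result zero, so shifting right by 16 and rotating right by 16 agree.
uint32_t viaSrl(uint32_t x) { return __builtin_bswap32(x) >> 16; }
uint32_t viaRotr(uint32_t x) {
  uint32_t b = __builtin_bswap32(x);
  return (b >> 16) | (b << 16); // rotr(b, 16)
}
// e.g. x = 0x0000abcd: both yield 0x0000cdab.
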
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -8575,15 +8723,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 =
DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+ St->getAlignment(), St->getMemOperand()->getFlags());
unsigned Offset = EltOffset;
while (--NumVecElts) {
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), Alignment);
+ St->getPointerInfo(), Alignment,
+ St->getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
@@ -8603,9 +8751,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
- // Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundaries. We want to split such stores.
- if (!Subtarget->isCyclone())
+ if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
@@ -8647,12 +8793,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
- S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+ S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
- S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
- S->getAlignment());
+ S->getPointerInfo(), S->getAlignment(),
+ S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
@@ -8741,9 +8887,10 @@ static SDValue performPostLD1Combine(SDNode *N,
LoadSDN->getMemOperand());
// Update the uses.
- SmallVector<SDValue, 2> NewResults;
- NewResults.push_back(SDValue(LD, 0)); // The result of load
- NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+ SDValue NewResults[] = {
+ SDValue(LD, 0), // The result of load
+ SDValue(UpdN.getNode(), 2) // Chain
+ };
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
@@ -8774,8 +8921,7 @@ static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
- if (Split.getNode())
+ if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
@@ -9215,10 +9361,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::Constant:
case ISD::TargetConstant: {
- if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
- 1LL << (width - 1))
- return true;
- return false;
+ return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+ 1LL << (width - 1);
}
}
@@ -9286,14 +9430,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
// isEquivalentMaskless() is the code for testing if the AND can be removed
// factored out of the DAG recognition as the DAG can take several forms.
-static
-bool isEquivalentMaskless(unsigned CC, unsigned width,
- ISD::LoadExtType ExtType, signed AddConstant,
- signed CompConstant) {
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, int AddConstant,
+ int CompConstant) {
// By being careful about our equations and only writing the in term
// symbolic values and well known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
- signed MaxUInt = (1 << width);
+ int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
@@ -9441,8 +9584,7 @@ SDValue performCONDCombine(SDNode *N,
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
- if (NV.getNode())
+ if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
@@ -9678,7 +9820,7 @@ static SDValue performSelectCombine(SDNode *N,
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
- SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+ SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
@@ -9716,6 +9858,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
+ case ISD::SRL:
+ return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
@@ -9829,10 +9973,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
- return false;
-
- return true;
+ return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
@@ -9935,6 +10076,31 @@ static void ReplaceReductionResults(SDNode *N,
Results.push_back(SplitVal);
}
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ assert(N->getValueType(0) == MVT::i128 &&
+ "AtomicCmpSwap on types less than 128 should be legal");
+ SDValue Ops[] = {N->getOperand(1),
+ N->getOperand(2)->getOperand(0),
+ N->getOperand(2)->getOperand(1),
+ N->getOperand(3)->getOperand(0),
+ N->getOperand(3)->getOperand(1),
+ N->getOperand(0)};
+ SDNode *CmpSwap = DAG.getMachineNode(
+ AArch64::CMP_SWAP_128, SDLoc(N),
+ DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ Results.push_back(SDValue(CmpSwap, 0));
+ Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(SDValue(CmpSwap, 3));
+}
+
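A sketch of source that reaches this replacement path: a 16-byte compare-and-swap, legalized through the CMP_SWAP_128 pseudo defined later in this patch. __int128 is a GCC/Clang extension, and the operation is typically lock-free on AArch64 via an ldxp/stxp loop.

#include <atomic>
bool cas128(std::atomic<__int128> &a, __int128 &expected, __int128 desired) {
  return a.compare_exchange_strong(expected, desired);
}
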
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -9966,11 +10132,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceCMP_SWAP_128Results(N, Results, DAG);
+ return;
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- return true;
+ if (!Subtarget->isTargetAndroid())
+ return true;
+ return TargetLowering::useLoadStackGuardNode();
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10017,14 +10188,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- return true;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement cmpxchg without spilling. If the address being exchanged is also
+ // on the stack and close enough to the spill slot, this can lead to a
+ // situation where the monitor always gets cleared and the atomic operation
+ // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire = isAtLeastAcquire(Ord);
+ bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
@@ -10066,7 +10242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isAtLeastRelease(Ord);
+ bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
@@ -10104,6 +10280,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getIRStackGuard(IRB);
+
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ const unsigned TlsOffset = 0x28;
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ return IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
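The IR built above amounts to loading the fifth pointer-sized TLS slot; a rough C++ equivalent, with the slot index derived from the 0x28 byte offset (a sketch, not how the backend actually emits it):

// Android keeps the stack cookie at thread-pointer + 0x28,
// i.e. slot 5 when counting 8-byte slots.
void *readStackGuard() {
  void **tls = static_cast<void **>(__builtin_thread_pointer());
  return tls[5]; // TLS_SLOT_STACK_GUARD
}
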
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (!Subtarget->isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
@@ -10114,7 +10306,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons
const unsigned TlsOffset = 0x48;
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
@@ -10166,3 +10358,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
.addReg(NewVR);
}
}
+
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on AArch64 is expensive. However, when aggressively
+ // optimizing for code size, we prefer to use a div instruction, as it is
+ // usually smaller than the alternative sequence.
+ // The exception to this is vector division. Since AArch64 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize =
+ Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
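
The "alternative sequence" the comment refers to is, for a power-of-two divisor, a bias-and-shift expansion; a scalar sketch of what a minsize build skips in favor of a single sdiv:

// Round-toward-zero division by 16 without a div instruction.
int divBy16(int x) {
  int bias = (x >> 31) & 15; // 15 for negative x, 0 otherwise
  return (x + bias) >> 4;    // equals x / 16 for every int x
}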
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index e99616c94068f..c87cfed1f892b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,6 +187,10 @@ enum NodeType : unsigned {
SMULL,
UMULL,
+ // Reciprocal estimates.
+ FRECPE,
+ FRSQRTE,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -272,11 +276,11 @@ public:
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
- MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
@@ -358,6 +362,10 @@ public:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
@@ -378,6 +386,8 @@ public:
return AArch64::X1;
}
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
bool isCheapToSpeculateCttz() const override {
return true;
}
@@ -385,6 +395,12 @@ public:
bool isCheapToSpeculateCtlz() const override {
return true;
}
+
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ // FIXME: Is this always true? It should be true for vectors at least.
+ return VT == MVT::f32 || VT == MVT::f64;
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
@@ -394,6 +410,10 @@ public:
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ bool supportSwiftError() const override {
+ return true;
+ }
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
@@ -401,30 +421,30 @@ private:
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
- void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+ void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
- SDValue
- LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const;
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
@@ -439,7 +459,7 @@ private:
bool IsTailCallConvention(CallingConv::ID CallCC) const;
- void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+ void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
SDValue &Chain) const;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -449,21 +469,21 @@ private:
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+ SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
- SDValue TVal, SDValue FVal, SDLoc dl,
+ SDValue TVal, SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
@@ -500,6 +520,11 @@ private:
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
+ SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+ SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -515,6 +540,9 @@ private:
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 4923a1161dfcf..59de62ad28771 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -29,7 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastAcquire(Ordering);
+ return isAcquireOrStronger(Ordering);
}]>;
// An atomic load operation that does not need either acquire or release
@@ -37,7 +37,7 @@ class acquiring_load<PatFrag base>
class relaxed_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return !isAtLeastAcquire(Ordering);
+ return !isAcquireOrStronger(Ordering);
}]>;
// 8-bit loads
@@ -112,15 +112,16 @@ def : Pat<(relaxed_load<atomic_load_64>
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- assert(Ordering != AcquireRelease && "unexpected store ordering");
- return isAtLeastRelease(Ordering);
+ assert(Ordering != AtomicOrdering::AcquireRelease &&
+ "unexpected store ordering");
+ return isReleaseOrStronger(Ordering);
}]>;
// An atomic store operation that doesn't actually need to be atomic on AArch64.
class relaxed_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return !isAtLeastRelease(Ordering);
+ return !isReleaseOrStronger(Ordering);
}]>;
// 8-bit stores
@@ -361,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
// And clear exclusive.
def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDXR and an
+// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we need an alternative (expanded post-regalloc)
+// path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>,
+ Sched<[WriteAtomic]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in
+def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status),
+ (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+ GPR64:$newLo, GPR64:$newHi), []>,
+ Sched<[WriteAtomic]>;
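
A sketch of code that exercises these pseudos: compiled at -O0, even a weak, relaxed cmpxchg like the one below goes through CMP_SWAP_32 and, per the comment above, is conservatively expanded as a strong, seq_cst operation.

#include <atomic>
bool tryUpdate(std::atomic<int> &a, int &expected, int desired) {
  return a.compare_exchange_weak(expected, desired,
                                 std::memory_order_relaxed);
}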
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 6ac2175e50355..34d35e961210e 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -496,7 +496,7 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
}]> {
let ParserMatchClass = Imm0_65535Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImmHex";
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
@@ -505,7 +505,7 @@ def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
let ParserMatchClass = Imm0_255Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
// imm0_127 predicate - True if the immediate is in the range [0,127]
@@ -514,7 +514,7 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
@@ -923,10 +923,7 @@ def psbhint_op : Operand<i32> {
// "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
- bool ValidNamed;
- (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(),
- STI.getFeatureBits(), ValidNamed);
- return ValidNamed;
+ return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
}];
}
@@ -1549,7 +1546,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
def movimm32_imm : Operand<i32> {
let ParserMatchClass = Imm0_65535Operand;
let EncoderMethod = "getMoveWideImmOpValue";
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
def movimm32_shift : Operand<i32> {
let PrintMethod = "printShifter";
@@ -9377,7 +9374,8 @@ class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
class BaseCAS<string order, string size, RegisterClass RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"cas" # order # size, "\t$Rs, $Rt, [$Rn]",
- "$out = $Rs",[]> {
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
let NP = 1;
}
@@ -9391,7 +9389,8 @@ multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
class BaseCASP<string order, string size, RegisterOperand RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"casp" # order # size, "\t$Rs, $Rt, [$Rn]",
- "$out = $Rs",[]> {
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
let NP = 0;
}
@@ -9405,7 +9404,8 @@ multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
let Predicates = [HasV8_1a] in
class BaseSWP<string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
- "\t$Rs, $Rt, [$Rn]","",[]> {
+ "\t$Rs, $Rt, [$Rn]","",[]>,
+ Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
@@ -9436,7 +9436,8 @@ multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
- "\t$Rs, $Rt, [$Rn]","",[]> {
+ "\t$Rs, $Rt, [$Rn]","",[]>,
+ Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index f398117de953b..0aa4708f35ac4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -22,27 +22,31 @@
#include "llvm/MC/MCInst.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
+static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair =
+ MachineMemOperand::MOTargetFlag1;
+
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
RI(STI.getTargetTriple()), Subtarget(STI) {}
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy; this is the maximum possible encoding size.
-unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- if (MI->getOpcode() == AArch64::INLINEASM)
- return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+ if (MI.getOpcode() == AArch64::INLINEASM)
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
switch (Desc.getOpcode()) {
default:
// Anything not explicitly designated otherwise is a normal 4-byte insn.
@@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
}
// Branch analysis.
-bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr *LastInst = &*I;
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
@@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
+ MachineInstr *SecondLastInst = &*I;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If AllowModify is true and the block ends with two or more unconditional
@@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
// Return now; the only terminator is an unconditional branch.
TBB = LastInst->getOperand(0).getMBB();
return false;
} else {
- SecondLastInst = I;
+ SecondLastInst = &*I;
SecondLastOpc = SecondLastInst->getOpcode();
}
}
}
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with a B and a Bcc, handle it.
@@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
}
void AArch64InstrInfo::instantiateCondBranch(
- MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+ MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
@@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch(
}
}
-unsigned AArch64InstrInfo::InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect(
}
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
}
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
-static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
- uint64_t Imm = MI->getOperand(1).getImm();
+static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
+ uint64_t Imm = MI.getOperand(1).getImm();
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
@@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
- if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
- return MI->isAsCheapAsAMove();
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ if (!Subtarget.hasCustomCheapAsMoveHandling())
+ return MI.isAsCheapAsAMove();
+
+ unsigned Imm;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return false;
@@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
- return (MI->getOperand(3).getImm() == 0);
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
+ MI.getOperand(3).getImm() == 0);
+
+ // add/sub on register with shift
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getArithShiftValue(Imm) < 4);
// logical ops on immediate
case AArch64::ANDWri:
@@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
+
+ // logical ops on register with shift
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ case AArch64::EONWrs:
+ case AArch64::EONXrs:
+ case AArch64::EORWrs:
+ case AArch64::EORXrs:
+ case AArch64::ORNWrs:
+ case AArch64::ORNXrs:
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getShiftValue(Imm) < 4 &&
+ AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
+
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
// ORRXri, it is as cheap as MOV
case AArch64::MOVi32imm:
return canBeExpandedToORR(MI, 32);
case AArch64::MOVi64imm:
return canBeExpandedToORR(MI, 64);
+
+  // It is cheap to zero out registers if the subtarget has the
+  // ZeroCycleZeroing feature.
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ return Subtarget.hasZeroCycleZeroing();
+ case TargetOpcode::COPY:
+ return (Subtarget.hasZeroCycleZeroing() &&
+ (MI.getOperand(1).getReg() == AArch64::WZR ||
+ MI.getOperand(1).getReg() == AArch64::XZR));
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
@@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
}
-bool
-AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
- MachineInstr *MIb,
- AliasAnalysis *AA) const {
+bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
+ MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned BaseRegA = 0, BaseRegB = 0;
- int OffsetA = 0, OffsetB = 0;
- int WidthA = 0, WidthB = 0;
+ int64_t OffsetA = 0, OffsetB = 0;
+ unsigned WidthA = 0, WidthB = 0;
- assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store.");
- assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store.");
+ assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+ assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
- if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
- MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+ MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Retrieve the base register, offset from the base register and width. Width
@@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
-bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBSWrr:
@@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
case AArch64::ADDSXrs:
case AArch64::ADDSXrx:
// Replace SUBSWrr with SUBWrr if NZCV is not used.
- SrcReg = MI->getOperand(1).getReg();
- SrcReg2 = MI->getOperand(2).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
@@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
case AArch64::ADDSWri:
case AArch64::SUBSXri:
case AArch64::ADDSXri:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: In order to convert CmpValue to 0 or 1
- CmpValue = (MI->getOperand(2).getImm() != 0);
+ CmpValue = MI.getOperand(2).getImm() != 0;
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
// ANDS does not use the same encoding scheme as the others xxxS
// instructions.
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: The return val type of decodeLogicalImmediate is uint64_t,
@@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
// the high 32 bits of uint64_t will be lost.
// In fact it causes a bug in spec2006-483.xalancbmk
// CmpValue is only used to compare with zero in OptimizeCompareInstr
- CmpValue = (AArch64_AM::decodeLogicalImmediate(
- MI->getOperand(2).getImm(),
- MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0);
+ CmpValue = AArch64_AM::decodeLogicalImmediate(
+ MI.getOperand(2).getImm(),
+ MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
return true;
}
return false;
}
-static bool UpdateOperandRegClass(MachineInstr *Instr) {
- MachineBasicBlock *MBB = Instr->getParent();
+static bool UpdateOperandRegClass(MachineInstr &Instr) {
+ MachineBasicBlock *MBB = Instr.getParent();
assert(MBB && "Can't get MachineBasicBlock here");
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
@@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+ for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
++OpIdx) {
- MachineOperand &MO = Instr->getOperand(OpIdx);
+ MachineOperand &MO = Instr.getOperand(OpIdx);
const TargetRegisterClass *OpRegCstraints =
- Instr->getRegClassConstraint(OpIdx, TII, TRI);
+ Instr.getRegClassConstraint(OpIdx, TII, TRI);
// If there's no constraint, there's nothing to do.
if (!OpRegCstraints)
@@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
/// \brief Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
-static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
+static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
- if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR))
+ if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
MIDefinesZeroReg = true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
- return MI->getOpcode();
+ return MI.getOpcode();
case AArch64::ADDSWrr:
return AArch64::ADDWrr;
case AArch64::ADDSWri:
@@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
}
}
-/// True when condition code could be modified on the instruction
-/// trace starting at from and ending at to.
-static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To,
- const bool CheckOnlyCCWrites,
- const TargetRegisterInfo *TRI) {
- // We iterate backward starting \p To until we hit \p From
- MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin();
+enum AccessKind {
+ AK_Write = 0x01,
+ AK_Read = 0x10,
+ AK_All = 0x11
+};
+/// True when condition flags are accessed (either by writing or reading)
+/// on the instruction trace starting at From and ending at To.
+///
+/// Note: If From and To are from different blocks it's assumed the condition
+/// flags are accessed on the path.
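+///
+/// e.g., with From = 'adds w8, w0, w1' and To = 'b.eq target' in the same
+/// block, an intervening 'cmp w9, #0' writes NZCV and causes this to return
+/// true for any AccessKind.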
+static bool areCFlagsAccessedBetweenInstrs(
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
// Early exit if To is at the beginning of the BB.
- if (I == B)
+ if (To == To->getParent()->begin())
return true;
- // Check whether the definition of SrcReg is in the same basic block as
- // Compare. If not, assume the condition code gets modified on some path.
+ // Check whether the instructions are in the same basic block
+ // If not, assume the condition flags might get modified somewhere.
if (To->getParent() != From->getParent())
return true;
- // Check that NZCV isn't set on the trace.
- for (--I; I != E; --I) {
- const MachineInstr &Instr = *I;
+ // From must be above To.
+ assert(std::find_if(MachineBasicBlock::reverse_iterator(To),
+ To->getParent()->rend(), [From](MachineInstr &MI) {
+ return MachineBasicBlock::iterator(MI) == From;
+ }) != To->getParent()->rend());
- if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
- (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI)))
- // This instruction modifies or uses NZCV after the one we want to
- // change.
- return true;
- if (I == B)
- // We currently don't allow the instruction trace to cross basic
- // block boundaries
+  // We iterate backward starting at \p To until we hit \p From.
+ for (--To; To != From; --To) {
+ const MachineInstr &Instr = *To;
+
+    if (((AccessToCheck & AK_Write) &&
+         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
+        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
return true;
}
return false;
}
-/// optimizeCompareInstr - Convert the instruction supplying the argument to the
-/// comparison into one that sets the zero bit in the flags register.
+
+/// Try to optimize a compare instruction. A compare instruction is an
+/// instruction which produces AArch64::NZCV. It is a true compare instruction
+/// when there are no uses of its destination register.
+///
+/// The following steps are tried in order:
+/// 1. Convert CmpInstr into an unconditional version.
+/// 2. Remove CmpInstr if there is an instruction above it that produces the
+///    needed condition codes, or an instruction that can be converted into
+///    such an instruction. Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
- MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
+ assert(CmpInstr.getParent());
+ assert(MRI);
// Replace SUBSWrr with SUBWrr if NZCV is not used.
- int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
- if (Cmp_NZCV != -1) {
- if (CmpInstr->definesRegister(AArch64::WZR) ||
- CmpInstr->definesRegister(AArch64::XZR)) {
- CmpInstr->eraseFromParent();
+ int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
+ if (DeadNZCVIdx != -1) {
+ if (CmpInstr.definesRegister(AArch64::WZR) ||
+ CmpInstr.definesRegister(AArch64::XZR)) {
+ CmpInstr.eraseFromParent();
return true;
}
- unsigned Opc = CmpInstr->getOpcode();
+ unsigned Opc = CmpInstr.getOpcode();
unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
- CmpInstr->setDesc(MCID);
- CmpInstr->RemoveOperand(Cmp_NZCV);
+ CmpInstr.setDesc(MCID);
+ CmpInstr.RemoveOperand(DeadNZCVIdx);
bool succeeded = UpdateOperandRegClass(CmpInstr);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
@@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return false;
// CmpInstr is a Compare instruction if destination register is not used.
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
- return false;
-
- // Get the unique definition of SrcReg.
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
- if (!MI)
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
- bool CheckOnlyCCWrites = false;
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI))
- return false;
+ return substituteCmpToZero(CmpInstr, SrcReg, MRI);
+}
- unsigned NewOpc = MI->getOpcode();
- switch (MI->getOpcode()) {
+/// Get the opcode of the S (flag-setting) version of Instr.
+/// If Instr is already an S version, its own opcode is returned.
+/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
+/// version or if we are not interested in it.
+static unsigned sForm(MachineInstr &Instr) {
+ switch (Instr.getOpcode()) {
default:
- return false;
+ return AArch64::INSTRUCTION_LIST_END;
+
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
@@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr(
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
- break;
- case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break;
- case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break;
- case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break;
- case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break;
- case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break;
- case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break;
- case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break;
- case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break;
- case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break;
- case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break;
- case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break;
- case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break;
- case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break;
- case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break;
- }
-
- // Scan forward for the use of NZCV.
- // When checking against MI: if it's a conditional code requires
- // checking of V bit, then this is not safe to do.
- // It is safe to remove CmpInstr if NZCV is redefined or killed.
- // If we are done with the basic block, we need to check whether NZCV is
- // live-out.
- bool IsSafe = false;
- for (MachineBasicBlock::iterator I = CmpInstr,
- E = CmpInstr->getParent()->end();
- !IsSafe && ++I != E;) {
- const MachineInstr &Instr = *I;
- for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
- ++IO) {
- const MachineOperand &MO = Instr.getOperand(IO);
- if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) {
- IsSafe = true;
- break;
- }
- if (!MO.isReg() || MO.getReg() != AArch64::NZCV)
- continue;
- if (MO.isDef()) {
- IsSafe = true;
- break;
- }
+    return Instr.getOpcode();
+
+ case AArch64::ADDWrr: return AArch64::ADDSWrr;
+ case AArch64::ADDWri: return AArch64::ADDSWri;
+ case AArch64::ADDXrr: return AArch64::ADDSXrr;
+ case AArch64::ADDXri: return AArch64::ADDSXri;
+ case AArch64::ADCWr: return AArch64::ADCSWr;
+ case AArch64::ADCXr: return AArch64::ADCSXr;
+ case AArch64::SUBWrr: return AArch64::SUBSWrr;
+ case AArch64::SUBWri: return AArch64::SUBSWri;
+ case AArch64::SUBXrr: return AArch64::SUBSXrr;
+ case AArch64::SUBXri: return AArch64::SUBSXri;
+ case AArch64::SBCWr: return AArch64::SBCSWr;
+ case AArch64::SBCXr: return AArch64::SBCSXr;
+ case AArch64::ANDWri: return AArch64::ANDSWri;
+ case AArch64::ANDXri: return AArch64::ANDSXri;
+ }
+}
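+// e.g., sForm(AArch64::ADDWrr) is AArch64::ADDSWrr, while an opcode with no
+// flag-setting form, such as AArch64::MADDWrrr, yields INSTRUCTION_LIST_END.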
- // Decode the condition code.
- unsigned Opc = Instr.getOpcode();
- AArch64CC::CondCode CC;
- switch (Opc) {
- default:
- return false;
- case AArch64::Bcc:
- CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm();
- break;
- case AArch64::CSINVWr:
- case AArch64::CSINVXr:
- case AArch64::CSINCWr:
- case AArch64::CSINCXr:
- case AArch64::CSELWr:
- case AArch64::CSELXr:
- case AArch64::CSNEGWr:
- case AArch64::CSNEGXr:
- case AArch64::FCSELSrrr:
- case AArch64::FCSELDrrr:
- CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm();
- break;
- }
+/// Check if AArch64::NZCV should be alive in successors of MBB.
+static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
+ for (auto *BB : MBB->successors())
+ if (BB->isLiveIn(AArch64::NZCV))
+ return true;
+ return false;
+}
- // It is not safe to remove Compare instruction if Overflow(V) is used.
- switch (CC) {
- default:
- // NZCV can be used multiple times, we should continue.
- break;
- case AArch64CC::VS:
- case AArch64CC::VC:
- case AArch64CC::GE:
- case AArch64CC::LT:
- case AArch64CC::GT:
- case AArch64CC::LE:
- return false;
- }
+struct UsedNZCV {
+ bool N;
+ bool Z;
+ bool C;
+ bool V;
+ UsedNZCV(): N(false), Z(false), C(false), V(false) {}
+ UsedNZCV& operator |=(const UsedNZCV& UsedFlags) {
+ this->N |= UsedFlags.N;
+ this->Z |= UsedFlags.Z;
+ this->C |= UsedFlags.C;
+ this->V |= UsedFlags.V;
+ return *this;
+ }
+};
+
+/// Find a condition code used by the instruction.
+/// Returns AArch64CC::Invalid if either the instruction does not use condition
+/// codes or we don't optimize CmpInstr in the presence of such instructions.
+static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
+ switch (Instr.getOpcode()) {
+ default:
+ return AArch64CC::Invalid;
+
+ case AArch64::Bcc: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 2);
+ return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
}
+
+ case AArch64::CSINVWr:
+ case AArch64::CSINVXr:
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr:
+ case AArch64::CSELWr:
+ case AArch64::CSELXr:
+ case AArch64::CSNEGWr:
+ case AArch64::CSNEGXr:
+ case AArch64::FCSELSrrr:
+ case AArch64::FCSELDrrr: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 1);
+ return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
+ }
+ }
+}
+
+static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
+ assert(CC != AArch64CC::Invalid);
+ UsedNZCV UsedFlags;
+ switch (CC) {
+ default:
+ break;
+
+ case AArch64CC::EQ: // Z set
+ case AArch64CC::NE: // Z clear
+ UsedFlags.Z = true;
+ break;
+
+ case AArch64CC::HI: // Z clear and C set
+ case AArch64CC::LS: // Z set or C clear
+ UsedFlags.Z = true;
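+    // HI and LS also depend on C; fall through.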
+ case AArch64CC::HS: // C set
+ case AArch64CC::LO: // C clear
+ UsedFlags.C = true;
+ break;
+
+ case AArch64CC::MI: // N set
+ case AArch64CC::PL: // N clear
+ UsedFlags.N = true;
+ break;
+
+ case AArch64CC::VS: // V set
+ case AArch64CC::VC: // V clear
+ UsedFlags.V = true;
+ break;
+
+ case AArch64CC::GT: // Z clear, N and V the same
+ case AArch64CC::LE: // Z set, N and V differ
+ UsedFlags.Z = true;
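+    // GT and LE also depend on N and V; fall through.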
+ case AArch64CC::GE: // N and V the same
+ case AArch64CC::LT: // N and V differ
+ UsedFlags.N = true;
+ UsedFlags.V = true;
+ break;
}
+ return UsedFlags;
+}
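+// e.g., getUsedNZCV(AArch64CC::EQ) sets only Z, while
+// getUsedNZCV(AArch64CC::GT) sets N, Z and V.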
+
+static bool isADDSRegImm(unsigned Opcode) {
+ return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
+}
+
+static bool isSUBSRegImm(unsigned Opcode) {
+ return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
+}
+
+/// Check if CmpInstr can be substituted by MI.
+///
+/// CmpInstr can be substituted when:
+/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
+/// - and MI and CmpInstr are in the same MachineBasicBlock
+/// - and the condition flags are not alive in successors of CmpInstr's parent
+/// - and, if MI's opcode is the S form, there are no defs of the flags
+///   between MI and CmpInstr; if MI's opcode is not the S form, there are
+///   neither defs nor uses of the flags between MI and CmpInstr
+/// - and the C and V flags are not used after CmpInstr.
+static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
+ const TargetRegisterInfo *TRI) {
+ assert(MI);
+ assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
+ assert(CmpInstr);
+
+ const unsigned CmpOpcode = CmpInstr->getOpcode();
+ if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
+ return false;
- // If NZCV is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if (!IsSafe) {
- MachineBasicBlock *ParentBlock = CmpInstr->getParent();
- for (auto *MBB : ParentBlock->successors())
- if (MBB->isLiveIn(AArch64::NZCV))
+ if (MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
+ return false;
+
+ AccessKind AccessToCheck = AK_Write;
+ if (sForm(*MI) != MI->getOpcode())
+ AccessToCheck = AK_All;
+ if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
+ return false;
+
+ UsedNZCV NZCVUsedAfterCmp;
+ for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end();
+ I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ if (Instr.readsRegister(AArch64::NZCV, TRI)) {
+ AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
+ if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
return false;
+ NZCVUsedAfterCmp |= getUsedNZCV(CC);
+ }
+
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI))
+ break;
}
+
+ return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
+}
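+// e.g., a following 'b.eq' or 'b.ne' reads only Z and permits the
+// substitution, while 'b.ge' reads N and V and blocks it, since the S form
+// of the defining instruction need not produce the same V flag as the
+// compare with zero.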
+
+/// Substitute an instruction comparing to zero with another instruction
+/// which produces the needed condition flags.
+///
+/// Return true on success.
+bool AArch64InstrInfo::substituteCmpToZero(
+ MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const {
+ assert(MRI);
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ unsigned NewOpc = sForm(*MI);
+ if (NewOpc == AArch64::INSTRUCTION_LIST_END)
+ return false;
+
+ if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
+ return false;
// Update the instruction to set NZCV.
MI->setDesc(get(NewOpc));
- CmpInstr->eraseFromParent();
- bool succeeded = UpdateOperandRegClass(MI);
+ CmpInstr.eraseFromParent();
+ bool succeeded = UpdateOperandRegClass(*MI);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
MI->addRegisterDefined(AArch64::NZCV, TRI);
return true;
}
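+// For example:
+//   add  w8, w0, w1
+//   cmp  w8, #0
+//   b.ne target
+// becomes:
+//   adds w8, w0, w1
+//   b.ne target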
-bool
-AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
return false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
- cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
@@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_GOT);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill).addImm(0)
- .addMemOperand(*MI->memoperands_begin());
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
@@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill).addImm(0)
- .addMemOperand(*MI->memoperands_begin());
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
} else {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
@@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
- .addMemOperand(*MI->memoperands_begin());
+ .addMemOperand(*MI.memoperands_begin());
}
MBB.erase(MI);
@@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::ADDSWrs:
@@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
case AArch64::SUBSXrs:
case AArch64::SUBWrs:
case AArch64::SUBXrs:
- if (MI->getOperand(3).isImm()) {
- unsigned val = MI->getOperand(3).getImm();
+ if (MI.getOperand(3).isImm()) {
+ unsigned val = MI.getOperand(3).getImm();
return (val != 0);
}
break;
@@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::ADDSWrx:
@@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
case AArch64::SUBWrx:
case AArch64::SUBXrx:
case AArch64::SUBXrx64:
- if (MI->getOperand(3).isImm()) {
- unsigned val = MI->getOperand(3).getImm();
+ if (MI.getOperand(3).isImm()) {
+ unsigned val = MI.getOperand(3).getImm();
return (val != 0);
}
break;
@@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
-bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::MOVZWi:
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
- if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
- assert(MI->getDesc().getNumOperands() == 3 &&
- MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+ if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
+ assert(MI.getDesc().getNumOperands() == 3 &&
+ MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
return true;
}
break;
case AArch64::ANDWri: // and Rd, Rzr, #imm
- return MI->getOperand(1).getReg() == AArch64::WZR;
+ return MI.getOperand(1).getReg() == AArch64::WZR;
case AArch64::ANDXri:
- return MI->getOperand(1).getReg() == AArch64::XZR;
+ return MI.getOperand(1).getReg() == AArch64::XZR;
case TargetOpcode::COPY:
- return MI->getOperand(1).getReg() == AArch64::WZR;
+ return MI.getOperand(1).getReg() == AArch64::WZR;
}
return false;
}
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// GPR32 copies will be lowered to ORRXrs
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
- if (MI->getOperand(1).getReg() == AArch64::XZR) {
- assert(MI->getDesc().getNumOperands() == 4 &&
- MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+ if (MI.getOperand(1).getReg() == AArch64::XZR) {
+ assert(MI.getDesc().getNumOperands() == 4 &&
+ MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
- if (MI->getOperand(2).getImm() == 0) {
- assert(MI->getDesc().getNumOperands() == 4 &&
- MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+ if (MI.getOperand(2).getImm() == 0) {
+ assert(MI.getDesc().getNumOperands() == 4 &&
+ MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
break;
@@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
// Return true if this instruction simply renames a floating-point register
// without modifying bits.
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// FPR64 copies will be lowered to ORR.16b
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
case AArch64::ORRv16i8:
- if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
- assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
"invalid ORRv16i8 operands");
return true;
}
@@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
return false;
}
-unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRWui:
@@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
- if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::STRWui:
@@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
- if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRBBroW:
@@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
case AArch64::STRWroX:
case AArch64::STRXroX:
- unsigned Val = MI->getOperand(3).getImm();
+ unsigned Val = MI.getOperand(3).getImm();
AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
}
@@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
}
/// Check all MachineMemOperands for a hint to suppress pairing.
-bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
- assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
- "Too many target MO flags");
- for (auto *MM : MI->memoperands()) {
- if (MM->getFlags() &
- (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
- return true;
- }
- }
- return false;
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+ return any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+ return MMO->getFlags() & MOSuppressPair;
+ });
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
-void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
- if (MI->memoperands_empty())
+void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+ if (MI.memoperands_empty())
return;
+ (*MI.memoperands_begin())->setFlags(MOSuppressPair);
+}
- assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
- "Too many target MO flags");
- (*MI->memoperands_begin())
- ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ return true;
+ }
}
-bool
-AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
- const TargetRegisterInfo *TRI) const {
- switch (LdSt->getOpcode()) {
+bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
+ return isUnscaledLdSt(MI.getOpcode());
+}
+
+// Is this a candidate for ld/st merging or pairing? For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI.hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+ if (!MI.getOperand(2).isImm())
+ return false;
+
+ // Can't merge/pair if the instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ unsigned BaseReg = MI.getOperand(1).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (MI.modifiesRegister(BaseReg, TRI))
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperand hints are set by the AArch64StorePairSuppress pass.
+ if (isLdStPairSuppressed(MI))
+ return false;
+
+ // On some CPUs quad load/store pairs are slower than two single load/stores.
+ if (Subtarget.avoidQuadLdStPairs()) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+ const TargetRegisterInfo *TRI) const {
+ switch (LdSt.getOpcode()) {
default:
return false;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
@@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
- if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
- return false;
- BaseReg = LdSt->getOperand(1).getReg();
- MachineFunction &MF = *LdSt->getParent()->getParent();
- unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
- Offset = LdSt->getOperand(2).getImm() * Width;
- return true;
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURXi:
+ case AArch64::STURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ unsigned Width;
+ return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
};
}
bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
- MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width,
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
- if (LdSt->getNumOperands() != 3)
- return false;
- if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ if (LdSt.getNumExplicitOperands() == 3) {
+ // Non-paired instruction (e.g., ldr x1, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+ return false;
+ } else if (LdSt.getNumExplicitOperands() == 4) {
+ // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+ !LdSt.getOperand(3).isImm())
+ return false;
+ } else
return false;
// Offset is calculated as the immediate operand multiplied by the scaling factor.
// Unscaled instructions have scaling factor set to 1.
- int Scale = 0;
- switch (LdSt->getOpcode()) {
+ unsigned Scale = 0;
+ switch (LdSt.getOpcode()) {
default:
return false;
case AArch64::LDURQi:
@@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
Width = 1;
Scale = 1;
break;
+ case AArch64::LDPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STPQi:
+ case AArch64::STNPQi:
+ Scale = 16;
+ Width = 32;
+ break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
break;
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ Scale = 8;
+ Width = 16;
+ break;
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
break;
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ Scale = 4;
+ Width = 8;
+ break;
case AArch64::LDRWui:
case AArch64::LDRSui:
+ case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
@@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STRBBui:
Scale = Width = 1;
break;
- };
+ }
- BaseReg = LdSt->getOperand(1).getReg();
- Offset = LdSt->getOperand(2).getImm() * Scale;
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
return true;
}
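+// For example, 'ldp x1, x2, [x0, #16]' carries an immediate operand of 2
+// and a scale of 8, giving BaseReg = x0, Offset = 16 and Width = 16.
+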
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = 1;
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ OffsetStride = 16;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ case AArch64::STURXi:
+ case AArch64::STURDi:
+ OffsetStride = 8;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::STURSi:
+ OffsetStride = 4;
+ break;
+ }
+ // If the byte-offset isn't a multiple of the stride, we can't scale this
+ // offset.
+ if (Offset % OffsetStride != 0)
+ return false;
+
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ Offset /= OffsetStride;
+ return true;
+}
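+// e.g., an LDURXi byte offset of 16 is divided by its stride of 8, giving
+// the element offset 2 used by the paired LDPXi form.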
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+ if (FirstOpc == SecondOpc)
+ return true;
+ // We can also pair sign-ext and zero-ext instructions.
+ switch (FirstOpc) {
+ default:
+ return false;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ }
+ // These instructions can't be paired based on their opcodes.
+ return false;
+}
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
-bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const {
+bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ MachineInstr &SecondLdSt,
+ unsigned NumLoads) const {
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
- if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+
+ // Can we pair these instructions based on their opcodes?
+ unsigned FirstOpc = FirstLdSt.getOpcode();
+ unsigned SecondOpc = SecondLdSt.getOpcode();
+ if (!canPairLdStOpc(FirstOpc, SecondOpc))
+ return false;
+
+ // Can't merge volatiles or load/stores that have a hint to avoid pair
+ // formation, for example.
+ if (!isCandidateToMergeOrPair(FirstLdSt) ||
+ !isCandidateToMergeOrPair(SecondLdSt))
+ return false;
+
+ // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+ int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
+ if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+ return false;
+
+ int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
+ if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
- // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
- // Allow 6 bits of positive range.
- if (Ofs1 > 64)
+
+ // Pairwise instructions have a 7-bit signed offset field.
+ if (Offset1 > 63 || Offset1 < -64)
return false;
+
// The caller should already have ordered First/SecondLdSt by offset.
- unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
- return Ofs1 + 1 == Ofs2;
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ return Offset1 + 1 == Offset2;
}
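+// For example, 'ldr x1, [x0, #8]' and 'ldr x2, [x0, #16]' have element
+// offsets 1 and 2, so they may be clustered and later merged into
+// 'ldp x1, x2, [x0, #8]'.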
-bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
- MachineInstr *Second) const {
- if (Subtarget.isCyclone()) {
- // Cyclone can fuse CMN, CMP, TST followed by Bcc.
- unsigned SecondOpcode = Second->getOpcode();
+bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const {
+ if (Subtarget.hasMacroOpFusion()) {
+ // Fuse CMN, CMP, TST followed by Bcc.
+ unsigned SecondOpcode = Second.getOpcode();
if (SecondOpcode == AArch64::Bcc) {
- switch (First->getOpcode()) {
+ switch (First.getOpcode()) {
default:
return false;
case AArch64::SUBSWri:
@@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
return true;
}
}
- // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ // Fuse ALU operations followed by CBZ/CBNZ.
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
- switch (First->getOpcode()) {
+ switch (First.getOpcode()) {
default:
return false;
case AArch64::ADDWri:
@@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
- const MDNode *Expr, DebugLoc DL) const {
+ const MDNode *Expr, const DebugLoc &DL) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
.addFrameIndex(FrameIx)
.addImm(0)
@@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
}
void AArch64InstrInfo::copyPhysRegTuple(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const {
assert(Subtarget.hasNEON() &&
@@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple(
}
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (SrcReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
- BuildMI(MBB, I, DL, get(AArch64::MRS))
- .addReg(DestReg)
+ BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
.addImm(AArch64SysReg::NZCV)
.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
return;
@@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot(
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Twov1d, Offset = false;
+ Opc = AArch64::ST1Twov1d;
+ Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Threev1d, Offset = false;
+ Opc = AArch64::ST1Threev1d;
+ Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Fourv1d, Offset = false;
+ Opc = AArch64::ST1Fourv1d;
+ Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Twov2d, Offset = false;
+ Opc = AArch64::ST1Twov2d;
+ Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Threev2d, Offset = false;
+ Opc = AArch64::ST1Threev2d;
+ Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Fourv2d, Offset = false;
+ Opc = AArch64::ST1Fourv2d;
+ Offset = false;
}
break;
}
@@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot(
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Twov1d, Offset = false;
+ Opc = AArch64::LD1Twov1d;
+ Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Threev1d, Offset = false;
+ Opc = AArch64::LD1Threev1d;
+ Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Fourv1d, Offset = false;
+ Opc = AArch64::LD1Fourv1d;
+ Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Twov2d, Offset = false;
+ Opc = AArch64::LD1Twov2d;
+ Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Threev2d, Offset = false;
+ Opc = AArch64::LD1Threev2d;
+ Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Fourv2d, Offset = false;
+ Opc = AArch64::LD1Fourv2d;
+ Offset = false;
}
break;
}
@@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot(
}
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg, int Offset,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV) {
if (DestReg == SrcReg && Offset == 0)
return;
+ assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
+ "SP increment/decrement not 16-byte aligned");
+
bool isSub = Offset < 0;
if (isSub)
Offset = -Offset;
@@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
}
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS) const {
// This is a bit of a hack. Consider this instruction:
//
// %vreg0<def> = COPY %SP; GPR64all:%vreg0
@@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
//
// <rdar://problem/11522048>
//
- if (MI->isCopy()) {
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ if (MI.isCopy()) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AArch64::SP &&
TargetRegisterInfo::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
@@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
-/// useMachineCombiner - return true when a target supports MachineCombiner
+
+// AArch64 supports MachineCombiner.
bool AArch64InstrInfo::useMachineCombiner() const {
- // AArch64 supports the combiner
return true;
}
//
@@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
return false;
}
//
+// FP Opcodes that can be combined with a FMUL
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+ switch (Inst.getOpcode()) {
+ case AArch64::FADDSrr:
+ case AArch64::FADDDrr:
+ case AArch64::FADDv2f32:
+ case AArch64::FADDv2f64:
+ case AArch64::FADDv4f32:
+ case AArch64::FSUBSrr:
+ case AArch64::FSUBDrr:
+ case AArch64::FSUBv2f32:
+ case AArch64::FSUBv2f64:
+ case AArch64::FSUBv4f32:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ break;
+ }
+ return false;
+}
+//
// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}
-static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
- unsigned MulOpc, unsigned ZeroReg) {
+//
+// Utility routine that checks if \p MO is defined by a \p CombineOpc
+// instruction in the basic block \p MBB.
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned CombineOpc, unsigned ZeroReg = 0,
+ bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
- // We need a virtual register definition.
+
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
- if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
- return false;
-
- assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
- MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
-
- // The third input reg must be zero.
- if (MI->getOperand(3).getReg() != ZeroReg)
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
-
// Must only be used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
+ if (CheckZeroReg) {
+ assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
+ // The third input reg must be zero.
+ if (MI->getOperand(3).getReg() != ZeroReg)
+ return false;
+ }
+
return true;
}
+//
+// Is \p MO defined by an integer multiply, and can it be combined?
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc, unsigned ZeroReg) {
+ return canCombine(MBB, MO, MulOpc, ZeroReg, true);
+}
+
+//
+// Is \p MO defined by a floating-point multiply, and can it be combined?
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc) {
+ return canCombine(MBB, MO, MulOpc);
+}
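+// e.g., when 'fadd s0, s1, s2' is the only user of a preceding
+// 'fmul s1, s3, s4', the combiner may form 'fmadd s0, s3, s4, s2'.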
+
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
@@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root,
bool Found = false;
if (!isCombineInstrCandidate(Opc))
- return 0;
+ return false;
if (isCombineInstrSettingFlag(Opc)) {
int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
- return 0;
- unsigned NewOpc = convertFlagSettingOpcode(&Root);
+ return false;
+ unsigned NewOpc = convertFlagSettingOpcode(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
- return 0;
+ return false;
Opc = NewOpc;
}
@@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root,
}
return Found;
}
+/// Floating-Point Support
+
+/// Find instructions that can be turned into a fused multiply-add.
+static bool getFMAPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+
+ if (!isCombineInstrCandidateFP(Root))
+    return false;
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ switch (Root.getOpcode()) {
+ default:
+ assert(false && "Unsupported FP instruction in combiner\n");
+ break;
+ case AArch64::FADDSrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "FADDWrr does not have register operands");
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
+ Found = true;
+ }
+ break;
+
+ case AArch64::FSUBSrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
+ Found = true;
+ }
+ break;
+ }
+ return Found;
+}
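+
+// Worked example (illustrative): for a v2f32 add whose first operand comes
+// from a single-use FMULv2f32 in the same block,
+//   %m = FMULv2f32 %a, %b
+//   %r = FADDv2f32 %m, %c
+// getFMAPatterns records FMLAv2f32_OP1, and the combiner may later emit
+//   %r = FMLAv2f32 %c, %a, %b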
+
+/// Return true when a code sequence can improve throughput. It
+/// should be called only for instructions in loops.
+/// \param Pattern - combiner pattern
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ switch (Pattern) {
+ default:
+ break;
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ return true;
+ } // end switch (Pattern)
+ return false;
+}
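+
+// Note (illustrative): unlike the integer MADD patterns above, these FMA
+// patterns are reported as throughput patterns, so the MachineCombiner can
+// apply them inside loops even when the critical-path latency does not
+// improve: a single FMLA replaces a separate FMUL and FADD each iteration.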
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root,
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ // Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
+ // Floating point patterns
+ if (getFMAPatterns(Root, Patterns))
+ return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
-/// genMadd - Generate madd instruction and combine mul and add.
-/// Example:
-/// MUL I=A,B,0
-/// ADD R,I,C
-/// ==> MADD R,A,B,C
-/// \param Root is the ADD instruction
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+/// F|MUL I=A,B,0
+/// F|ADD R,I,C
+/// ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
-/// the MUL. In the example above IdxMulOpd is 1.
-/// \param MaddOpc the opcode fo the madd instruction
-static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
- const TargetInstrInfo *TII, MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- unsigned IdxMulOpd, unsigned MaddOpc,
- const TargetRegisterClass *RC) {
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the f|madd instruction
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+ unsigned MaddOpc, const TargetRegisterClass *RC,
+                 FMAInstKind Kind = FMAInstKind::Default) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
@@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
- MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
- ResultReg)
- .addReg(SrcReg0, getKillRegState(Src0IsKill))
- .addReg(SrcReg1, getKillRegState(Src1IsKill))
- .addReg(SrcReg2, getKillRegState(Src2IsKill));
- // Insert the MADD
+ MachineInstrBuilder MIB;
+  if (Kind == FMAInstKind::Default)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(SrcReg2, getKillRegState(Src2IsKill));
+  else if (Kind == FMAInstKind::Indexed)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addImm(MUL->getOperand(3).getImm());
+  else if (Kind == FMAInstKind::Accumulator)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill));
+ else
+ assert(false && "Invalid FMA instruction kind \n");
+  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
InsInstrs.push_back(MIB);
return MUL;
}
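+// Operand-order sketch (illustrative): FMAInstKind::Default emits the scalar
+// three-register form (mul operands first, accumulator last), while the
+// Indexed and Accumulator kinds place the accumulator first; the Indexed kind
+// additionally copies the lane immediate from the original F|MUL.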
@@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
@@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
@@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
@@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
+ // Floating Point Support
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDD_OP1:
+    // FMUL I=A,B,0
+    // FADD R,I,C
+    // ==> FMADD R,A,B,C
+    // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ // FMUL I=A,B,0
+ // FADD R,C,I
+ // ==> FMADD R,A,B,C
+ // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP1: {
+ // FMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMSUB R,A,B,C // = -C + A*B
+ // --- Create(FNMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP2: {
+ // FMUL I=A,B,0
+ // FSUB R,C,I
+ // ==> FMSUB R,A,B,C (computes C - A*B)
+ // --- Create(FMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  }
+
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ Opc = AArch64::FMLSv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ Opc = AArch64::FMLSv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+ Opc = AArch64::FMLSv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+ Opc = AArch64::FMLSv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+ Opc = AArch64::FMLSv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
@@ -2940,14 +3817,23 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
/// to
/// b.<condition code>
///
+/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
+/// compare's constant operand is a power of 2.
+///
+/// Examples:
+/// and w8, w8, #0x400
+/// cbnz w8, L1
+/// to
+/// tbnz w8, #10, L1
+///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
-bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
bool IsNegativeBranch = false;
bool IsTestAndBranch = false;
unsigned TargetBBInMI = 0;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
@@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
// So we increment a zero register and test for bits other
// than bit 0? Conservatively bail out in case the verifier
// missed this case.
- if (IsTestAndBranch && MI->getOperand(1).getImm())
+ if (IsTestAndBranch && MI.getOperand(1).getImm())
return false;
// Find Definition.
- assert(MI->getParent() && "Incomplete machine instruciton\n");
- MachineBasicBlock *MBB = MI->getParent();
+  assert(MI.getParent() && "Incomplete machine instruction");
+ MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- unsigned VReg = MI->getOperand(0).getReg();
+ unsigned VReg = MI.getOperand(0).getReg();
if (!TargetRegisterInfo::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
- // Look for CSINC
- if (!(DefMI->getOpcode() == AArch64::CSINCWr &&
- DefMI->getOperand(1).getReg() == AArch64::WZR &&
- DefMI->getOperand(2).getReg() == AArch64::WZR) &&
- !(DefMI->getOpcode() == AArch64::CSINCXr &&
- DefMI->getOperand(1).getReg() == AArch64::XZR &&
- DefMI->getOperand(2).getReg() == AArch64::XZR))
- return false;
+ // Look through COPY instructions to find definition.
+ while (DefMI->isCopy()) {
+ unsigned CopyVReg = DefMI->getOperand(1).getReg();
+ if (!MRI->hasOneNonDBGUse(CopyVReg))
+ return false;
+ if (!MRI->hasOneDef(CopyVReg))
+ return false;
+ DefMI = MRI->getVRegDef(CopyVReg);
+ }
- if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ switch (DefMI->getOpcode()) {
+ default:
return false;
+  // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
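+  // e.g. (illustrative):
+  //   and  x8, x9, #0x4
+  //   cbz  x8, L1
+  // becomes
+  //   tbz  w9, #2, L1
+  // using the W variant, since bit 2 is below 32 (see below).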
+ case AArch64::ANDWri:
+ case AArch64::ANDXri: {
+ if (IsTestAndBranch)
+ return false;
+ if (DefMI->getParent() != MBB)
+ return false;
+ if (!MRI->hasOneNonDBGUse(VReg))
+ return false;
- AArch64CC::CondCode CC =
- (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
- bool CheckOnlyCCWrites = true;
- // Convert only when the condition code is not modified between
- // the CSINC and the branch. The CC may be used by other
- // instructions in between.
- if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo()))
- return false;
- MachineBasicBlock &RefToMBB = *MBB;
- MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB();
- DebugLoc DL = MI->getDebugLoc();
- if (IsNegativeBranch)
- CC = AArch64CC::getInvertedCondCode(CC);
- BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
- MI->eraseFromParent();
- return true;
+ bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
+ uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
+ DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
+ if (!isPowerOf2_64(Mask))
+ return false;
+
+ MachineOperand &MO = DefMI->getOperand(1);
+ unsigned NewReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+ return false;
+
+ assert(!MRI->def_empty(NewReg) && "Register must be defined.");
+
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Imm = Log2_64(Mask);
+ unsigned Opc = (Imm < 32)
+ ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
+ : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
+ MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
+ .addReg(NewReg)
+ .addImm(Imm)
+ .addMBB(TBB);
+      // The register now lives on to the TBZ/TBNZ, so clear any kill flag.
+ MO.setIsKill(false);
+
+      // Bit numbers smaller than 32 must use the 32-bit (W) variant,
+      // since the 64-bit variant cannot encode them. Therefore, if the
+      // input register is 64-bit, take its 32-bit sub-register.
+ if (!Is32Bit && Imm < 32)
+ NewMI->getOperand(0).setSubReg(AArch64::sub_32);
+ MI.eraseFromParent();
+ return true;
+ }
+ // Look for CSINC
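+  // e.g. (illustrative):
+  //   cset w8, lt        ; expands to CSINCWr %w8, wzr, wzr, ge
+  //   cbnz w8, L1
+  // becomes
+  //   b.lt L1
+  // provided NZCV is not modified between the CSINC and the branch.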
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr: {
+ if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
+ DefMI->getOperand(2).getReg() == AArch64::WZR) &&
+ !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
+ DefMI->getOperand(2).getReg() == AArch64::XZR))
+ return false;
+
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ return false;
+
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+ // Convert only when the condition code is not modified between
+ // the CSINC and the branch. The CC may be used by other
+ // instructions in between.
+ if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
+ return false;
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
+ if (IsNegativeBranch)
+ CC = AArch64CC::getInvertedCondCode(CC);
+ BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
}
std::pair<unsigned, unsigned>
@@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_GOT, "aarch64-got"},
{MO_NC, "aarch64-nc"},
- {MO_TLS, "aarch64-tls"},
- {MO_CONSTPOOL, "aarch64-constant-pool"}};
+ {MO_TLS, "aarch64-tls"}};
return makeArrayRef(TargetFlags);
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index b5bb446f8c167..24bc0e6397477 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -28,12 +28,6 @@ class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64InstrInfo : public AArch64GenInstrInfo {
- // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
- // They will be shifted into MOTargetHintStart when accessed.
- enum TargetMemOperandFlags {
- MOSuppressPair = 1
- };
-
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
@@ -45,76 +39,88 @@ public:
/// always be able to get register info as well (through this method).
const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
- unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
- bool isAsCheapAsAMove(const MachineInstr *MI) const override;
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DstReg, unsigned &SubIdx) const override;
bool
- areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// Returns true if there is a shiftable register and that the shift value
/// is non-zero.
- bool hasShiftedReg(const MachineInstr *MI) const;
+ bool hasShiftedReg(const MachineInstr &MI) const;
/// Returns true if there is an extendable register and that the extending
/// value is non-zero.
- bool hasExtendedReg(const MachineInstr *MI) const;
+ bool hasExtendedReg(const MachineInstr &MI) const;
/// \brief Does this instruction set its full destination register to zero?
- bool isGPRZero(const MachineInstr *MI) const;
+ bool isGPRZero(const MachineInstr &MI) const;
/// \brief Does this instruction rename a GPR without modifying bits?
- bool isGPRCopy(const MachineInstr *MI) const;
+ bool isGPRCopy(const MachineInstr &MI) const;
/// \brief Does this instruction rename an FPR without modifying bits?
- bool isFPRCopy(const MachineInstr *MI) const;
+ bool isFPRCopy(const MachineInstr &MI) const;
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
- bool isScaledAddr(const MachineInstr *MI) const;
+ bool isScaledAddr(const MachineInstr &MI) const;
/// Return true if pairing the given load or store is hinted to be
/// unprofitable.
- bool isLdStPairSuppressed(const MachineInstr *MI) const;
+ bool isLdStPairSuppressed(const MachineInstr &MI) const;
+
+ /// Return true if this is an unscaled load/store.
+ bool isUnscaledLdSt(unsigned Opc) const;
+
+ /// Return true if this is an unscaled load/store.
+ bool isUnscaledLdSt(MachineInstr &MI) const;
+
+ /// Return true if this is a load/store that can be potentially paired/merged.
+ bool isCandidateToMergeOrPair(MachineInstr &MI) const;
/// Hint that pairing the given load or store is unprofitable.
- void suppressLdStPair(MachineInstr *MI) const;
+ void suppressLdStPair(MachineInstr &MI) const;
- bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
- bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg,
- int &Offset, int &Width,
+ bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
bool enableClusterLoads() const override { return true; }
- bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
- unsigned NumLoads) const override;
+ bool enableClusterStores() const override { return true; }
+
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(MachineInstr *First,
- MachineInstr *Second) const override;
+ bool shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const override;
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *Var,
- const MDNode *Expr, DebugLoc DL) const;
+ const MDNode *Expr,
+ const DebugLoc &DL) const;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -129,40 +135,47 @@ public:
const TargetRegisterInfo *TRI) const override;
using TargetInstrInfo::foldMemoryOperandImpl;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int &, int &, int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
void getNoopForMachoTarget(MCInst &NopInst) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction supplying the argument to
/// the comparison into one that sets the zero bit in the flags register.
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
- bool optimizeCondBranch(MachineInstr *MI) const override;
+ bool optimizeCondBranch(MachineInstr &MI) const override;
+
+ /// Return true when a code sequence can improve throughput. It
+ /// should be called only for instructions in loops.
+ /// \param Pattern - combiner pattern
+ bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in <Root>. All potential patterns are
/// listed in the <Patterns> array.
@@ -179,10 +192,10 @@ public:
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
- /// useMachineCombiner - AArch64 supports MachineCombiner
+ /// AArch64 supports MachineCombiner.
bool useMachineCombiner() const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned TF) const override;
@@ -192,9 +205,11 @@ public:
getSerializableBitmaskMachineOperandTargetFlags() const override;
private:
- void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+ void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
+ bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const;
};
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
@@ -202,8 +217,8 @@ private:
/// insertion (PEI) pass, where a virtual scratch register may be allocated
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
- const TargetInstrInfo *TII,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ int Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index d02bc9ff394d3..af9ed812e6da3 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -26,6 +26,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
@@ -34,7 +36,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -283,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -295,9 +301,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
-//
-def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
-def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
def ForCodeSize : Predicate<"ForCodeSize">;
@@ -312,10 +315,13 @@ include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+// We set Sched to an empty list because we expect these instructions to simply get
+// removed in most cases.
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>;
+ [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+ [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
let isReMaterializable = 1, isCodeGenOnly = 1 in {
@@ -383,6 +389,7 @@ def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
@@ -528,6 +535,12 @@ def i64imm_32bit : ImmLeaf<i64, [{
return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;
+def s64imm_32bit : ImmLeaf<i64, [{
+ int64_t Imm64 = static_cast<int64_t>(Imm);
+ return Imm64 >= std::numeric_limits<int32_t>::min() &&
+ Imm64 <= std::numeric_limits<int32_t>::max();
+}]>;
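+// Illustrative: s64imm_32bit accepts any i64 constant that fits in a signed
+// 32-bit immediate (e.g. -5 or 0x7fffffff, but not 0x80000000), so it can be
+// materialized with a single MOVi32imm by the [SU]MADDL/[SU]MSUBL patterns
+// below.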
+
def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
@@ -679,10 +692,11 @@ def : InstAlias<"negs $dst, $src$shift",
// Unsigned/Signed divide
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
-let isCodeGenOnly = 1 in {
-defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
-defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
-}
+
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
// Variable shift
defm ASRV : Shift<0b10, "asr", sra>;
@@ -734,6 +748,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
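+// Fold a multiply of a sign/zero-extended 32-bit register by a constant that
+// fits in 32 bits into [SU]MADDL / [SU]MSUBL, materializing the constant once.
+// e.g. (illustrative):
+//   (i64)(s32)w0 * 100  ==>  mov w8, #100 ; smaddl x0, w0, w8, xzr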
+def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
+ GPR64:$Ra)),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
+ (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
} // AddedComplexity = 5
def : MulAccumWAlias<"mul", MADDWrrr>;
@@ -1089,6 +1137,14 @@ def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
(CSINVWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
(CSINVXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
@@ -1158,7 +1214,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
// Create a separate pseudo-instruction for codegen to use so that we don't
// flag lr as used in every function. It'll be restored before the RET by the
// epilogue if it's legitimately used.
-def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
+ Sched<[WriteBrReg]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
@@ -1168,7 +1225,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
// (which in the usual case is a BLR).
let hasSideEffects = 1 in
-def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
let AsmString = ".tlsdesccall $sym";
}
@@ -1178,7 +1235,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
isCodeGenOnly = 1 in
def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
- [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>;
+ [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
+ Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
(TLSDESC_CALLSEQ texternalsym:$sym)>;
@@ -2444,13 +2502,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
-defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+
+multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
+ def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
+ def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
+ def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
+ def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
+ def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
+ def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+
+ def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
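+// Illustrative: with these patterns, (i32 (int_aarch64_neon_fcvtzs f32:$Rn))
+// now selects the regular FCVTZSUWSr instruction directly, replacing the
+// removed isCodeGenOnly FCVTZS_Int/FCVTZU_Int duplicates.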
+
multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
def : Pat<(i32 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
@@ -2485,13 +2562,11 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
defm FMOV : UnscaledConversion<"fmov">;
// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
-let isReMaterializable = 1, isCodeGenOnly = 1 in {
+let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
}
//===----------------------------------------------------------------------===//
@@ -2617,6 +2692,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
(i32 imm:$cond), NZCV))]> {
let Uses = [NZCV];
let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
}
@@ -2742,12 +2818,19 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
- int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
- int_aarch64_neon_fcvtzu>;
-}
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
+
defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
@@ -3318,6 +3401,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+ (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+ (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+ (FRECPEv2f64 FPR128:$Rn)>;
+
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3330,6 +3426,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+ (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+ (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+ (FRSQRTEv2f64 FPR128:$Rn)>;
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8- and 16-bit values to float.
@@ -4319,18 +4428,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
"movi", ".2d",
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
-
-// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
-// Complexity is added to break a tie with a plain MOVI.
-let AddedComplexity = 1 in {
-def : Pat<(f32 fpimm0),
- (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
- Requires<[HasZCZ]>;
-def : Pat<(f64 fpimm0),
- (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
- Requires<[HasZCZ]>;
-}
-
def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
@@ -4845,7 +4942,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4898,7 +4996,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -5982,7 +6081,7 @@ def : NTStore64Pat<v8i8>;
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
- (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity=10
} // Predicates = [IsLE]
@@ -5990,8 +6089,10 @@ def : Pat<(nontemporalstore GPR64:$Rt,
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
- def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
}
def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 43664df3b861a..dca13fc494140 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -33,9 +33,6 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
-/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
-/// load / store instructions to form ldp / stp instructions.
-
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
@@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
-static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
+// The LdStLimit limits how far we search for load/store pairs.
+static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
+// The UpdateLimit limits how far we search for update instructions when we form
+// pre-/post-index instructions.
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
+ cl::Hidden);
+
+static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable narrow load merge"));
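+// Illustrative usage: narrow load merging is now off by default and can be
+// re-enabled for experiments, e.g.:
+//   llc -enable-narrow-ld-merge=true file.ll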
+
namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}
@@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
+ // Track which registers have been modified and used.
+ BitVector ModifiedRegs, UsedRegs;
+
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,
- unsigned Limit);
+ unsigned Limit,
+ bool FindNarrowMerge);
// Scan the instructions looking for a store that writes to the address from
// which the current load instruction reads. Return true if one is found.
bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI);
+ // Merge the two instructions indicated into a wider instruction.
+ MachineBasicBlock::iterator
+ mergeNarrowInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
+ const LdStPairFlags &Flags);
+
// Merge the two instructions indicated into a single pair-wise instruction.
- // If MergeForward is true, erase the first instruction and fold its
- // operation into the second. If false, the reverse. Return the instruction
- // following the first instruction (which may change during processing).
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
- findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
- int UnscaledOffset);
+ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
+ int UnscaledOffset, unsigned Limit);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
@@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find an instruction that updates the base register of the ld/st
// instruction.
- bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
+ bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
// Merge a pre- or post-index base register update into a ld/st instruction.
@@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+ // Find and pair ldr/str instructions.
+ bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
+
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
- // Check if converting two narrow loads into a single wider load with
- // bitfield extracts could be enabled.
- bool enableNarrowLdMerge(MachineFunction &Fn);
-
bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return AARCH64_LOAD_STORE_OPT_NAME;
}
@@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0;
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
-static bool isUnscaledLdSt(unsigned Opc) {
- switch (Opc) {
- default:
- return false;
- case AArch64::STURSi:
- case AArch64::STURDi:
- case AArch64::STURQi:
- case AArch64::STURBBi:
- case AArch64::STURHHi:
- case AArch64::STURWi:
- case AArch64::STURXi:
- case AArch64::LDURSi:
- case AArch64::LDURDi:
- case AArch64::LDURQi:
- case AArch64::LDURWi:
- case AArch64::LDURXi:
- case AArch64::LDURSWi:
- case AArch64::LDURHHi:
- case AArch64::LDURBBi:
- case AArch64::LDURSBWi:
- case AArch64::LDURSHWi:
- return true;
- }
-}
-
-static bool isUnscaledLdSt(MachineInstr *MI) {
- return isUnscaledLdSt(MI->getOpcode());
-}
-
-static unsigned getBitExtrOpcode(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static unsigned getBitExtrOpcode(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode.");
case AArch64::LDRBBui:
@@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) {
}
}
-static bool isNarrowStore(MachineInstr *MI) {
- return isNarrowStore(MI->getOpcode());
-}
-
static bool isNarrowLoad(unsigned Opc) {
switch (Opc) {
default:
@@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) {
}
}
-static bool isNarrowLoad(MachineInstr *MI) {
- return isNarrowLoad(MI->getOpcode());
+static bool isNarrowLoad(MachineInstr &MI) {
+ return isNarrowLoad(MI.getOpcode());
+}
+
+static bool isNarrowLoadOrStore(unsigned Opc) {
+ return isNarrowLoad(Opc) || isNarrowStore(Opc);
}
// Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static int getMemScale(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode has unknown scale!");
case AArch64::LDRBBui:
@@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
}
}
+static unsigned getMatchingWideOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no wide equivalent!");
+ case AArch64::STRBBui:
+ return AArch64::STRHHui;
+ case AArch64::STRHHui:
+ return AArch64::STRWui;
+ case AArch64::STURBBi:
+ return AArch64::STURHHi;
+ case AArch64::STURHHi:
+ return AArch64::STURWi;
+ case AArch64::STURWi:
+ return AArch64::STURXi;
+ case AArch64::STRWui:
+ return AArch64::STRXui;
+ case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRWui;
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHWi:
+ return AArch64::LDURWi;
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ return AArch64::LDURHHi;
+ }
+}
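
As a sanity check on the table above, a minimal standalone sketch; the enum is a hypothetical stand-in for the TableGen-generated opcode values, and only the two zero-store cases are modeled:

// Reduced model of the widening table (hypothetical opcode enum).
#include <cassert>

enum Opc { STRHHui, STRWui, STRXui };

static Opc wideOpcode(Opc O) {
  switch (O) {
  case STRHHui: return STRWui; // two halfword stores -> one word store
  case STRWui:  return STRXui; // two word stores -> one doubleword store
  default:      assert(false && "no wide equivalent"); return O;
  }
}

int main() {
  // strh wzr, [x0] + strh wzr, [x0, #2]  ==>  str wzr, [x0]
  assert(wideOpcode(STRHHui) == STRWui);
  // str wzr, [x0] + str wzr, [x0, #4]   ==>  str xzr, [x0]
  assert(wideOpcode(STRWui) == STRXui);
}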
+
static unsigned getMatchingPairOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
- case AArch64::STRBBui:
- return AArch64::STRHHui;
- case AArch64::STRHHui:
- return AArch64::STRWui;
- case AArch64::STURBBi:
- return AArch64::STURHHi;
- case AArch64::STURHHi:
- return AArch64::STURWi;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
@@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
- case AArch64::LDRHHui:
- case AArch64::LDRSHWui:
- return AArch64::LDRWui;
- case AArch64::LDURHHi:
- case AArch64::LDURSHWi:
- return AArch64::LDURWi;
- case AArch64::LDRBBui:
- case AArch64::LDRSBWui:
- return AArch64::LDRHHui;
- case AArch64::LDURBBi:
- case AArch64::LDURSBWi:
- return AArch64::LDURHHi;
}
}
-static unsigned isMatchingStore(MachineInstr *LoadInst,
- MachineInstr *StoreInst) {
- unsigned LdOpc = LoadInst->getOpcode();
- unsigned StOpc = StoreInst->getOpcode();
+static unsigned isMatchingStore(MachineInstr &LoadInst,
+ MachineInstr &StoreInst) {
+ unsigned LdOpc = LoadInst.getOpcode();
+ unsigned StOpc = StoreInst.getOpcode();
switch (LdOpc) {
default:
llvm_unreachable("Unsupported load instruction!");
@@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
}
}
-static bool isPairedLdSt(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isPairedLdSt(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
return false;
case AArch64::LDPSi:
@@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) {
}
}
-static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
+static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
- MachineInstr *StoreInst) {
+static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
+ MachineInstr &StoreInst,
+ const AArch64InstrInfo *TII) {
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
int LoadSize = getMemScale(LoadInst);
int StoreSize = getMemScale(StoreInst);
- int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+ int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
? getLdStOffsetOp(StoreInst).getImm()
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
- int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+ int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
? getLdStOffsetOp(LoadInst).getImm()
: getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
+static bool isPromotableZeroStoreOpcode(unsigned Opc) {
+ return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
+}
+
+static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
+ return isPromotableZeroStoreOpcode(MI.getOpcode());
+}
+
+static bool isPromotableZeroStoreInst(MachineInstr &MI) {
+ return (isPromotableZeroStoreOpcode(MI)) &&
+ getLdStRegOp(MI).getReg() == AArch64::WZR;
+}
+
MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
+AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;
++NextI;
@@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// to skip one further. Either way, merging will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
- if (NextI == Paired)
+ if (NextI == MergeMI)
++NextI;
- int SExtIdx = Flags.getSExtIdx();
- unsigned Opc =
- SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
- bool IsUnscaled = isUnscaledLdSt(Opc);
- int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
+ unsigned Opc = I->getOpcode();
+ bool IsScaled = !TII->isUnscaledLdSt(Opc);
+ int OffsetStride = IsScaled ? 1 : getMemScale(*I);
bool MergeForward = Flags.getMergeForward();
- unsigned NewOpc = getMatchingPairOpcode(Opc);
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
- MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
const MachineOperand &BaseRegOp =
- MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
+ MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
- if (getLdStOffsetOp(I).getImm() ==
- getLdStOffsetOp(Paired).getImm() + OffsetStride) {
- RtMI = Paired;
- Rt2MI = I;
- // Here we swapped the assumption made for SExtIdx.
- // I.e., we turn ldp I, Paired into ldp Paired, I.
- // Update the index accordingly.
- if (SExtIdx != -1)
- SExtIdx = (SExtIdx + 1) % 2;
+ if (getLdStOffsetOp(*I).getImm() ==
+ getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) {
+ RtMI = &*MergeMI;
+ Rt2MI = &*I;
} else {
- RtMI = I;
- Rt2MI = Paired;
+ RtMI = &*I;
+ Rt2MI = &*MergeMI;
}
- int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Change the scaled offset from small to large type.
+ if (IsScaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
if (isNarrowLoad(Opc)) {
- // Change the scaled offset from small to large type.
- if (!IsUnscaled) {
- assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
- OffsetImm /= 2;
- }
- MachineInstr *RtNewDest = MergeForward ? I : Paired;
+ MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
// When merging small (< 32 bit) loads for big-endian targets, the order of
// the component parts gets swapped.
if (!Subtarget->isLittleEndian())
std::swap(RtMI, Rt2MI);
// Construct the new load instruction.
MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
- NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(RtNewDest))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ NewMemMI =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+ .addOperand(getLdStRegOp(*RtNewDest))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ (void)NewMemMI;
DEBUG(
dbgs()
<< "Creating the new load and extract. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
- DEBUG(Paired->print(dbgs()));
+ DEBUG(MergeMI->print(dbgs()));
DEBUG(dbgs() << " with instructions:\n ");
DEBUG((NewMemMI)->print(dbgs()));
- int Width = getMemScale(I) == 1 ? 8 : 16;
+ int Width = getMemScale(*I) == 1 ? 8 : 16;
int LSBLow = 0;
int LSBHigh = Width;
int ImmsLow = LSBLow + Width - 1;
int ImmsHigh = LSBHigh + Width - 1;
- MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+ MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
// Create the bitfield extract for high bits.
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(Rt2MI)))
- .addOperand(getLdStRegOp(Rt2MI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBHigh)
- .addImm(ImmsHigh);
+ BitExtMI1 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
// Create the bitfield extract for low bits.
if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
// For unsigned, prefer to use AND for low bits.
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::ANDWri))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
+ BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
.addImm(ImmsLow);
} else {
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(RtMI)))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBLow)
- .addImm(ImmsLow);
+ BitExtMI2 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
}
} else {
// Create the bitfield extract for low bits.
if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
// For unsigned, prefer to use AND for low bits.
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::ANDWri))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
+ BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
.addImm(ImmsLow);
} else {
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(RtMI)))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBLow)
- .addImm(ImmsLow);
+ BitExtMI1 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
}
// Create the bitfield extract for high bits.
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(Rt2MI)))
- .addOperand(getLdStRegOp(Rt2MI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBHigh)
- .addImm(ImmsHigh);
+ BitExtMI2 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
}
+ (void)BitExtMI1;
+ (void)BitExtMI2;
+
DEBUG(dbgs() << " ");
DEBUG((BitExtMI1)->print(dbgs()));
DEBUG(dbgs() << " ");
@@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// Erase the old instructions.
I->eraseFromParent();
- Paired->eraseFromParent();
+ MergeMI->eraseFromParent();
return NextI;
}
+ assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+ "Expected promotable zero store");
// Construct the new instruction.
MachineInstrBuilder MIB;
- if (isNarrowStore(Opc)) {
- // Change the scaled offset from small to large type.
- if (!IsUnscaled) {
- assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
- OffsetImm /= 2;
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+ .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MergeMI->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ MergeMI->eraseFromParent();
+ return NextI;
+}
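
For the narrow-load path above, the bitfield-extract operands fall out of the element width; a small arithmetic sketch (plain C++, no LLVM types) of the LSB/imms computation:

// After widening two ldrh's into one ldr, the two halves are recovered with
// UBFX-style extracts: immr = lsb, imms = lsb + width - 1.
#include <cassert>

int main() {
  int MemScale = 2;                   // ldrh accesses halfword elements
  int Width = MemScale == 1 ? 8 : 16; // same computation as the pass
  int LSBLow = 0, LSBHigh = Width;
  int ImmsLow = LSBLow + Width - 1;   // ubfx wN, wD, #0, #16  -> imms 15
  int ImmsHigh = LSBHigh + Width - 1; // ubfx wM, wD, #16, #16 -> imms 31
  assert(ImmsLow == 15 && LSBHigh == 16 && ImmsHigh == 31);
}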
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags) {
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way, merging will invalidate the iterator,
+ // and we don't need to scan the new instruction, as it's a pairwise
+ // instruction, which we're not considering for further action anyway.
+ if (NextI == Paired)
+ ++NextI;
+
+ int SExtIdx = Flags.getSExtIdx();
+ unsigned Opc =
+ SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
+ bool IsUnscaled = TII->isUnscaledLdSt(Opc);
+ int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+
+ bool MergeForward = Flags.getMergeForward();
+ // Insert our new paired instruction after whichever of the paired
+ // instructions MergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ // Also based on MergeForward is from where we copy the base register operand
+ // so we get the flags compatible with the input code.
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+
+ int Offset = getLdStOffsetOp(*I).getImm();
+ int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+ bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
+ if (IsUnscaled != PairedIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled. If
+ // I is scaled then scale the offset of Paired accordingly. Otherwise, do
+ // the opposite (i.e., make Paired's offset unscaled).
+ int MemSize = getMemScale(*Paired);
+ if (PairedIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together.
+ assert(!(PairedOffset % getMemScale(*Paired)) &&
+ "Offset should be a multiple of the stride!");
+ PairedOffset /= MemSize;
+ } else {
+ PairedOffset *= MemSize;
}
- MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(I))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ }
+
+ // Which register is Rt and which is Rt2 depends on the offset order.
+ MachineInstr *RtMI, *Rt2MI;
+ if (Offset == PairedOffset + OffsetStride) {
+ RtMI = &*Paired;
+ Rt2MI = &*I;
+ // Here we swapped the assumption made for SExtIdx.
+ // I.e., we turn ldp I, Paired into ldp Paired, I.
+ // Update the index accordingly.
+ if (SExtIdx != -1)
+ SExtIdx = (SExtIdx + 1) % 2;
} else {
- // Handle Unscaled
- if (IsUnscaled)
- OffsetImm /= OffsetStride;
- MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(RtMI))
- .addOperand(getLdStRegOp(Rt2MI))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm);
+ RtMI = &*I;
+ Rt2MI = &*Paired;
+ }
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Scale the immediate offset, if necessary.
+ if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
+ assert(!(OffsetImm % getMemScale(*RtMI)) &&
+ "Unscaled offset cannot be scaled.");
+ OffsetImm /= getMemScale(*RtMI);
}
- (void)MIB;
+ // Construct the new instruction.
+ MachineInstrBuilder MIB;
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*Paired));
- // FIXME: Do we need/want to copy the mem operands from the source
- // instructions? Probably. What uses them after this?
+ (void)MIB;
DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(Paired->print(dbgs()));
DEBUG(dbgs() << " with instruction:\n ");
-
if (SExtIdx != -1) {
// Generate the sign extension for the proper result of the ldp.
// I.e., with X1, that would be:
@@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// Insert this definition right after the generated LDP, i.e., before
// InsertionPoint.
MachineInstrBuilder MIBKill =
- BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(TargetOpcode::KILL), DstRegW)
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
.addReg(DstRegW)
.addReg(DstRegX, RegState::Define);
MIBKill->getOperand(2).setImplicit();
// Create the sign extension.
MachineInstrBuilder MIBSXTW =
- BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::SBFMXri), DstRegX)
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
.addReg(DstRegX)
.addImm(0)
.addImm(31);
(void)MIBSXTW;
DEBUG(dbgs() << " Extend operand:\n ");
DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
- DEBUG(dbgs() << "\n");
} else {
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
}
+ DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
@@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator NextI = LoadI;
++NextI;
- int LoadSize = getMemScale(LoadI);
- int StoreSize = getMemScale(StoreI);
- unsigned LdRt = getLdStRegOp(LoadI).getReg();
- unsigned StRt = getLdStRegOp(StoreI).getReg();
+ int LoadSize = getMemScale(*LoadI);
+ int StoreSize = getMemScale(*StoreI);
+ unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
assert((IsStoreXReg ||
@@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// performance and correctness are verified only in little-endian.
if (!Subtarget->isLittleEndian())
return NextI;
- bool IsUnscaled = isUnscaledLdSt(LoadI);
- assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+ bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
+ assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
+ "Unsupported ld/st match");
assert(LoadSize <= StoreSize && "Invalid load size");
int UnscaledLdOffset = IsUnscaled
- ? getLdStOffsetOp(LoadI).getImm()
- : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+ ? getLdStOffsetOp(*LoadI).getImm()
+ : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
int UnscaledStOffset = IsUnscaled
- ? getLdStOffsetOp(StoreI).getImm()
- : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+ ? getLdStOffsetOp(*StoreI).getImm()
+ : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
int Imms = Immr + Width - 1;
@@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
.addImm(Imms);
}
}
+ (void)BitExtMI;
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
@@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
-static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
+static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
BitVector &UsedRegs,
const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI->operands()) {
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isRegMask())
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
if (MO.isDef()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
ModifiedRegs.set(*AI);
@@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
- if (IsUnscaled)
+ if (IsUnscaled) {
+ // If the byte-offset isn't a multiple of the stride, there's no point
+ // trying to match it.
+ if (Offset % OffsetStride)
+ return false;
Offset /= OffsetStride;
-
+ }
return Offset <= 63 && Offset >= -64;
}
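
To make the arithmetic concrete, a self-contained restatement of the check with a few worked cases; the 8-byte stride is illustrative (e.g., an x-register access), not taken from the pass:

// Standalone sketch: element offset for a pair must fit a 7-bit signed field.
#include <cassert>

static bool inBoundsForPairSketch(bool IsUnscaled, int Offset,
                                  int OffsetStride) {
  if (IsUnscaled) {
    if (Offset % OffsetStride) // byte offset must be stride-aligned
      return false;
    Offset /= OffsetStride;    // convert byte offset -> element offset
  }
  return Offset <= 63 && Offset >= -64;
}

int main() {
  assert(inBoundsForPairSketch(true, 24, 8));   // #24 -> element 3: ok
  assert(!inBoundsForPairSketch(true, 20, 8));  // #20 not 8-byte aligned
  assert(!inBoundsForPairSketch(false, 64, 1)); // element 64 > 63: reject
}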
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
-// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
+// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
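
A quick standalone check of the rounding trick; like the pass, it assumes PowOf2 genuinely is a power of two:

#include <cassert>

static int alignToSketch(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

int main() {
  assert(alignToSketch(10, 8) == 16); // round 10 up to the next multiple of 8
  assert(alignToSketch(16, 8) == 16); // aligned values are unchanged
  assert(alignToSketch(1, 2) == 2);   // the narrow-merge path uses PowOf2 == 2
}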
-static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
+static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
const AArch64InstrInfo *TII) {
// One of the instructions must modify memory.
- if (!MIa->mayStore() && !MIb->mayStore())
+ if (!MIa.mayStore() && !MIb.mayStore())
return false;
// Both instructions must be memory operations.
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
+ if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}
-static bool mayAlias(MachineInstr *MIa,
+static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
const AArch64InstrInfo *TII) {
- for (auto &MIb : MemInsns)
- if (mayAlias(MIa, MIb, TII))
+ for (MachineInstr *MIb : MemInsns)
+ if (mayAlias(MIa, *MIb, TII))
return true;
return false;
@@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa,
bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI) {
- MachineBasicBlock::iterator E = I->getParent()->begin();
+ MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
- MachineInstr *FirstMI = I;
- unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ MachineInstr &LoadMI = *I;
+ unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+
+ // If the load is the first instruction in the block, there's obviously
+  // no matching store.
+ if (MBBI == B)
+ return false;
// Track which registers have been modified and used between the first insn
// and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
- for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+ unsigned Count = 0;
+ do {
--MBBI;
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
- continue;
- // Now that we know this is a real instruction, count it.
- ++Count;
+ MachineInstr &MI = *MBBI;
+
+ // Don't count DBG_VALUE instructions towards the search limit.
+ if (!MI.isDebugValue())
+ ++Count;
// If the load instruction reads directly from the address to which the
// store instruction writes and the stored value is not modified, we can
// promote the load. Since we do not handle stores with pre-/post-index,
// it's unnecessary to check if BaseReg is modified by the store itself.
- if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+ if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
BaseReg == getLdStBaseOp(MI).getReg() &&
- isLdOffsetInRangeOfSt(FirstMI, MI) &&
+ isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
!ModifiedRegs[getLdStRegOp(MI).getReg()]) {
StoreI = MBBI;
return true;
}
- if (MI->isCall())
+ if (MI.isCall())
return false;
// Update modified / uses register lists.
@@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+ if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
return false;
- }
+ } while (MBBI != B && Count < Limit);
return false;
}
-/// findMatchingInsn - Scan the instructions looking for a load/store that can
-/// be combined with the current instruction into a load/store pair.
+// Returns true if FirstMI and MI are candidates for merging or pairing.
+// Otherwise, returns false.
+static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
+ LdStPairFlags &Flags,
+ const AArch64InstrInfo *TII) {
+ // If this is volatile or if pairing is suppressed, not a candidate.
+ if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+ return false;
+
+ // We should have already checked FirstMI for pair suppression and volatility.
+ assert(!FirstMI.hasOrderedMemoryRef() &&
+ !TII->isLdStPairSuppressed(FirstMI) &&
+ "FirstMI shouldn't get here if either of these checks are true.");
+
+ unsigned OpcA = FirstMI.getOpcode();
+ unsigned OpcB = MI.getOpcode();
+
+ // Opcodes match: nothing more to check.
+ if (OpcA == OpcB)
+ return true;
+
+ // Try to match a sign-extended load/store with a zero-extended load/store.
+ bool IsValidLdStrOpc, PairIsValidLdStrOpc;
+ unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
+ assert(IsValidLdStrOpc &&
+ "Given Opc should be a Load or Store with an immediate");
+ // OpcA will be the first instruction in the pair.
+ if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
+ Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
+ return true;
+ }
+
+ // If the second instruction isn't even a load/store, bail out.
+ if (!PairIsValidLdStrOpc)
+ return false;
+
+ // FIXME: We don't support merging narrow loads/stores with mixed
+ // scaled/unscaled offsets.
+ if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
+ return false;
+
+ // Try to match an unscaled load/store with a scaled load/store.
+ return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
+ getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
+
+ // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
+}
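
A reduced model of the compatibility ladder above, covering only the sext/zext fold; the enum values are hypothetical stand-ins, not the real generated opcodes:

// Toy candidate test: identical opcodes always match, and a sign-extending
// load may pair with its zero-extending twin, recording which side of the
// pair needs the later SBFM fixup.
#include <cassert>

enum Opc { LDRWui, LDRSWui, STRWui };

static Opc nonSExt(Opc O) { return O == LDRSWui ? LDRWui : O; }

static bool candidates(Opc A, Opc B, int &SExtIdx) {
  if (A == B)
    return true;
  if (nonSExt(A) == nonSExt(B)) {
    SExtIdx = (nonSExt(A) == A) ? 1 : 0; // the sext op is the other one
    return true;
  }
  return false;
}

int main() {
  int SExtIdx = -1;
  assert(candidates(LDRSWui, LDRWui, SExtIdx) && SExtIdx == 0);
  assert(candidates(LDRWui, LDRSWui, SExtIdx) && SExtIdx == 1);
  assert(!candidates(LDRWui, STRWui, SExtIdx)); // load vs. store: no match
}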
+
+/// Scan the instructions looking for a load/store that can be combined with the
+/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
- LdStPairFlags &Flags, unsigned Limit) {
+ LdStPairFlags &Flags, unsigned Limit,
+ bool FindNarrowMerge) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
- MachineInstr *FirstMI = I;
+ MachineInstr &FirstMI = *I;
++MBBI;
- unsigned Opc = FirstMI->getOpcode();
- bool MayLoad = FirstMI->mayLoad();
- bool IsUnscaled = isUnscaledLdSt(FirstMI);
+ bool MayLoad = FirstMI.mayLoad();
+ bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
unsigned Reg = getLdStRegOp(FirstMI).getReg();
unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
- bool IsNarrowStore = isNarrowStore(Opc);
-
- // For narrow stores, find only the case where the stored value is WZR.
- if (IsNarrowStore && Reg != AArch64::WZR)
- return E;
-
- // Early exit if the first instruction modifies the base register.
- // e.g., ldr x0, [x0]
- if (FirstMI->modifiesRegister(BaseReg, TRI))
- return E;
-
- // Early exit if the offset if not possible to match. (6 bits of positive
- // range, plus allow an extra one in case we find a later insn that matches
- // with Offset-1)
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
- if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
- !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
- return E;
+ bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// Skip DBG_VALUE instructions. Otherwise debug info can affect the
// optimization by changing how far we scan.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
continue;
// Now that we know this is a real instruction, count it.
++Count;
- bool CanMergeOpc = Opc == MI->getOpcode();
Flags.setSExtIdx(-1);
- if (!CanMergeOpc) {
- bool IsValidLdStrOpc;
- unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
- assert(IsValidLdStrOpc &&
- "Given Opc should be a Load or Store with an immediate");
- // Opc will be the first instruction in the pair.
- Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
- CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
- }
-
- if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
- assert(MI->mayLoadOrStore() && "Expected memory operation.");
+ if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
+ getLdStOffsetOp(MI).isImm()) {
+ assert(MI.mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
// These instructions all have scaled immediate operands, so we just
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
- //
- // Pairwise instructions have a 7-bit signed offset field. Single insns
- // have a 12-bit unsigned offset field. To be a valid combine, the
- // final offset must be in range.
unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
+ bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
+ if (IsUnscaled != MIIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled.
+ // If FirstMI is scaled then scale the offset of MI accordingly.
+ // Otherwise, do the opposite (i.e., make MI's offset unscaled).
+ int MemSize = getMemScale(MI);
+ if (MIIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together: bail and keep looking.
+ if (MIOffset % MemSize)
+ continue;
+ MIOffset /= MemSize;
+ } else {
+ MIOffset *= MemSize;
+ }
+ }
+
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
(Offset + OffsetStride == MIOffset))) {
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
- // If this is a volatile load/store that otherwise matched, stop looking
- // as something is going on that we don't have enough information to
- // safely transform. Similarly, stop if we see a hint to avoid pairs.
- if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
- return E;
- // If the resultant immediate offset of merging these instructions
- // is out of range for a pairwise instruction, bail and keep looking.
- bool MIIsUnscaled = isUnscaledLdSt(MI);
- bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
- if (!IsNarrowLoad &&
- !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
- continue;
- }
-
- if (IsNarrowLoad || IsNarrowStore) {
+ if (FindNarrowMerge) {
// If the alignment requirements of the scaled wide load/store
- // instruction can't express the offset of the scaled narrow
- // input, bail and keep looking.
- if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+ // instruction can't express the offset of the scaled narrow input,
+ // bail and keep looking. For promotable zero stores, allow only when
+ // the stored value is the same (i.e., WZR).
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
+ (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
} else {
+ // Pairwise instructions have a 7-bit signed offset field. Single
+ // insns have a 12-bit unsigned offset field. If the resultant
+ // immediate offset of merging these instructions is out of range for
+ // a pairwise instruction, bail and keep looking.
+ if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
// If the alignment requirements of the paired (scaled) instruction
// can't express the offset of the unscaled input, bail and keep
// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
}
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
- // For narrow stores, allow only when the stored value is the same
- // (i.e., WZR).
- if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
- (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
+ if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
@@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// and first alias with the second, we can combine the second into the
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
- !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+ !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
!mayAlias(MI, MemInsns, TII)) {
Flags.setMergeForward(false);
return MBBI;
@@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// The instruction wasn't a matching load or store. Stop searching if we
// encounter a call instruction that might modify memory.
- if (MI->isCall())
+ if (MI.isCall())
return E;
// Update modified / uses register lists.
@@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
return E;
// Update list of instructions that read/write memory.
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ if (MI.mayLoadOrStore())
+ MemInsns.push_back(&MI);
}
return E;
}
@@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
: getPostIndexedOpcode(I->getOpcode());
MachineInstrBuilder MIB;
- if (!isPairedLdSt(I)) {
+ if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(Update))
- .addOperand(getLdStRegOp(I))
- .addOperand(getLdStBaseOp(I))
- .addImm(Value);
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
- int Scale = getMemScale(I);
+ int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(Update))
- .addOperand(getLdStRegOp(I, 0))
- .addOperand(getLdStRegOp(I, 1))
- .addOperand(getLdStBaseOp(I))
- .addImm(Value / Scale);
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I, 0))
+ .addOperand(getLdStRegOp(*I, 1))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value / Scale)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
(void)MIB;
@@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
-bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
- MachineInstr *MI,
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
unsigned BaseReg, int Offset) {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBXri:
@@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
case AArch64::ADDXri:
// Make sure it's a vanilla immediate operand, not a relocation or
// anything else we can't handle.
- if (!MI->getOperand(2).isImm())
+ if (!MI.getOperand(2).isImm())
break;
// Watch out for 1 << 12 shifted value.
- if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
+ if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
break;
// The update instruction source and destination register must be the
// same as the load/store base register.
- if (MI->getOperand(0).getReg() != BaseReg ||
- MI->getOperand(1).getReg() != BaseReg)
+ if (MI.getOperand(0).getReg() != BaseReg ||
+ MI.getOperand(1).getReg() != BaseReg)
break;
bool IsPairedInsn = isPairedLdSt(MemMI);
- int UpdateOffset = MI->getOperand(2).getImm();
+ int UpdateOffset = MI.getOperand(2).getImm();
// For non-paired load/store instructions, the immediate must fit in a
// signed 9-bit integer.
if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
@@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
// If we have a non-zero Offset, we check that it matches the amount
// we're adding to the register.
- if (!Offset || Offset == MI->getOperand(2).getImm())
+ if (!Offset || Offset == MI.getOperand(2).getImm())
return true;
break;
}
@@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
- MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
+ MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineInstr *MemMI = I;
+ MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
@@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
++MBBI;
- for (unsigned Count = 0; MBBI != E; ++MBBI) {
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ // Skip DBG_VALUE instructions.
+ if (MI.isDebugValue())
continue;
// Now that we know this is a real instruction, count it.
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator I, unsigned Limit) {
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineInstr *MemMI = I;
+ MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
@@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
- --MBBI;
- for (unsigned Count = 0; MBBI != B; --MBBI) {
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
- continue;
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+ unsigned Count = 0;
+ do {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
- // Now that we know this is a real instruction, count it.
- ++Count;
+ // Don't count DBG_VALUE instructions towards the search limit.
+ if (!MI.isDebugValue())
+ ++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// return early.
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
return E;
- }
+ } while (MBBI != B && Count < Limit);
return E;
}
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// If this is a volatile load, don't mess with it.
- if (MI->hasOrderedMemoryRef())
+ if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg+imm.
@@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
if (!getLdStOffsetOp(MI).isImm())
return false;
- // Look backward up to ScanLimit instructions.
+ // Look backward up to LdStLimit instructions.
MachineBasicBlock::iterator StoreI;
- if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
+ if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
++NumLoadsFromStoresPromoted;
// Promote the load. Keeping the iterator straight is a
// pain, so we let the merge routine tell us what the next instruction
@@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
return false;
}
+// Find narrow loads that can be converted into a single wider load with
+// bitfield extract instructions. Also merge adjacent zero stores into a wider
+// store.
bool AArch64LoadStoreOpt::tryToMergeLdStInst(
MachineBasicBlock::iterator &MBBI) {
- MachineInstr *MI = MBBI;
- MachineBasicBlock::iterator E = MI->getParent()->end();
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef())
- return false;
+ assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
+ "Expected narrow op.");
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!getLdStOffsetOp(MI).isImm())
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI))
+ // For promotable zero stores, the stored value should be WZR.
+ if (isPromotableZeroStoreOpcode(MI) &&
+ getLdStRegOp(MI).getReg() != AArch64::WZR)
return false;
- // Look ahead up to ScanLimit instructions for a pairable instruction.
+ // Look ahead up to LdStLimit instructions for a mergable instruction.
LdStPairFlags Flags;
- MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
- if (Paired != E) {
+ MachineBasicBlock::iterator MergeMI =
+ findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
+ if (MergeMI != E) {
if (isNarrowLoad(MI)) {
++NumNarrowLoadsPromoted;
- } else if (isNarrowStore(MI)) {
+ } else if (isPromotableZeroStoreInst(MI)) {
++NumZeroStoresPromoted;
- } else {
- ++NumPairCreated;
- if (isUnscaledLdSt(MI))
- ++NumUnscaledPairCreated;
}
+ // Keeping the iterator straight is a pain, so we let the merge routine tell
+ // us what the next instruction is after it's done mucking about.
+ MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
+ return true;
+ }
+ return false;
+}
- // Merge the loads into a pair. Keeping the iterator straight is a
- // pain, so we let the merge routine tell us what the next instruction
- // is after it's done mucking about.
+// Find loads and stores that can be merged into a single load or store pair
+// instruction.
+bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+
+ if (!TII->isCandidateToMergeOrPair(MI))
+ return false;
+
+  // Early exit if the offset is not possible to match. (6 bits of positive
+  // range, plus we allow an extra one in case we find a later insn that
+  // matches with Offset-1.)
+ bool IsUnscaled = TII->isUnscaledLdSt(MI);
+ int Offset = getLdStOffsetOp(MI).getImm();
+ int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+ if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+ return false;
+
+ // Look ahead up to LdStLimit instructions for a pairable instruction.
+ LdStPairFlags Flags;
+ MachineBasicBlock::iterator Paired =
+ findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
+ if (Paired != E) {
+ ++NumPairCreated;
+ if (TII->isUnscaledLdSt(MI))
+ ++NumUnscaledPairCreated;
+ // Keeping the iterator straight is a pain, so we let the merge routine tell
+ // us what the next instruction is after it's done mucking about.
MBBI = mergePairedInsns(MBBI, Paired, Flags);
return true;
}
@@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst(
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool enableNarrowLdOpt) {
bool Modified = false;
- // Three tranformations to do here:
+  // Four transformations to do here:
// 1) Find loads that directly read from stores and promote them by
// replacing with mov instructions. If the store is wider than the load,
// the load will be replaced with a bitfield extract.
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldrh w2, [x0, #6]
// ; becomes
// str w1, [x0, #4]
- // lsr w2, w1, #16
- // 2) Find narrow loads that can be converted into a single wider load
- // with bitfield extract instructions.
- // e.g.,
- // ldrh w0, [x2]
- // ldrh w1, [x2, #2]
- // ; becomes
- // ldr w0, [x2]
- // ubfx w1, w0, #16, #16
- // and w0, w0, #ffff
- // 3) Find loads and stores that can be merged into a single load or store
- // pair instruction.
- // e.g.,
- // ldr x0, [x2]
- // ldr x1, [x2, #8]
- // ; becomes
- // ldp x0, x1, [x2]
- // 4) Find base register updates that can be merged into the load or store
- // as a base-reg writeback.
- // e.g.,
- // ldr x0, [x2]
- // add x2, x2, #4
- // ; becomes
- // ldr x0, [x2], #4
-
+ // lsr w2, w1, #16
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
default:
// Just move on to the next instruction.
++MBBI;
@@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
-
+ // 2) Find narrow loads that can be converted into a single wider load
+ // with bitfield extract instructions.
+ // e.g.,
+ // ldrh w0, [x2]
+ // ldrh w1, [x2, #2]
+ // ; becomes
+ // ldr w0, [x2]
+ // ubfx w1, w0, #16, #16
+ // and w0, w0, #ffff
+ //
+ // Also merge adjacent zero stores into a wider store.
+ // e.g.,
+ // strh wzr, [x0]
+ // strh wzr, [x0, #2]
+ // ; becomes
+ // str wzr, [x0]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
enableNarrowLdOpt && MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
- default:
- // Just move on to the next instruction.
- ++MBBI;
- break;
- // Scaled instructions.
- case AArch64::LDRBBui:
- case AArch64::LDRHHui:
- case AArch64::LDRSBWui:
- case AArch64::LDRSHWui:
- case AArch64::STRBBui:
- case AArch64::STRHHui:
- // Unscaled instructions.
- case AArch64::LDURBBi:
- case AArch64::LDURHHi:
- case AArch64::LDURSBWi:
- case AArch64::LDURSHWi:
- case AArch64::STURBBi:
- case AArch64::STURHHi: {
+ MachineInstr &MI = *MBBI;
+ unsigned Opc = MI.getOpcode();
+ if (isPromotableZeroStoreOpcode(Opc) ||
+ (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
if (tryToMergeLdStInst(MBBI)) {
Modified = true;
- break;
- }
+ } else
+ ++MBBI;
+ } else
++MBBI;
- break;
- }
- // FIXME: Do the other instructions.
- }
}
+ // 3) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
default:
// Just move on to the next instruction.
++MBBI;
@@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi: {
- if (tryToMergeLdStInst(MBBI)) {
+ if (tryToPairLdStInst(MBBI)) {
Modified = true;
break;
}
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
-
+ // 4) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// Do update merging. It's simpler to keep this separate from the above
- // switch, though not strictly necessary.
- unsigned Opc = MI->getOpcode();
+    // switches, though not strictly necessary.
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
// Just move on to the next instruction.
@@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// merged into:
// ldr x0, [x20], #32
MachineBasicBlock::iterator Update =
- findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
+ findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
@@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
}
// Don't know how to handle pre/post-index versions, so move to the next
// instruction.
- if (isUnscaledLdSt(Opc)) {
+ if (TII->isUnscaledLdSt(Opc)) {
++MBBI;
break;
}
@@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldr x1, [x0]
// merged into:
// ldr x1, [x0, #8]!
- Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
+ Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
@@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// add x0, x0, #64
// merged into:
// ldr x1, [x0, #64]!
- Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
+ Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
@@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
return Modified;
}
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
- bool ProfitableArch = Subtarget->isCortexA57();
- // FIXME: The benefit from converting narrow loads into a wider load could be
- // microarchitectural as it assumes that a single load with two bitfield
- // extracts is cheaper than two narrow loads. Currently, this conversion is
- // enabled only in cortex-a57 on which performance benefits were verified.
- return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
+ // Resize the modified and used register bitfield trackers. We do this once
+ // per function and then clear the bitfield each time we optimize a load or
+ // store.
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+
bool Modified = false;
- bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+ bool enableNarrowLdOpt =
+ Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
@@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?
+// FIXME: When pairing store instructions it's very possible for this pass to
+// hoist a store with a KILL marker above another use (without a KILL marker).
+// The resulting IR is invalid, but nothing uses the KILL markers after this
+// pass, so it's never caused a problem in practice.
+
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 318f839535053..49e7767741eaa 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -48,6 +48,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// \brief Amount of stack frame size, not including callee-saved registers.
unsigned LocalStackSize;
+ /// \brief Amount of stack frame size used for saving callee-saved registers.
+ unsigned CalleeSavedStackSize;
+
/// \brief Number of TLS accesses using the special (combinable)
/// _TLS_MODULE_BASE_ symbol.
unsigned NumLocalDynamicTLSAccesses;
@@ -76,18 +79,28 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// copies.
bool IsSplitCSR;
+ /// True when the stack gets realigned dynamically because the size of the
+ /// stack frame is unknown at compile time, e.g., in the case of VLAs.
+ bool StackRealigned;
+
+ /// True when the callee-save stack area has unused gaps that may be used for
+ /// other stack allocations.
+ bool CalleeSaveStackHasFreeSpace;
+
public:
AArch64FunctionInfo()
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
- IsSplitCSR(false) {}
+ IsSplitCSR(false), StackRealigned(false),
+ CalleeSaveStackHasFreeSpace(false) {}
explicit AArch64FunctionInfo(MachineFunction &MF)
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
- IsSplitCSR(false) {
+ IsSplitCSR(false), StackRealigned(false),
+ CalleeSaveStackHasFreeSpace(false) {
(void)MF;
}
@@ -102,12 +115,25 @@ public:
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
+ bool isStackRealigned() const { return StackRealigned; }
+ void setStackRealigned(bool s) { StackRealigned = s; }
+
+ bool hasCalleeSaveStackFreeSpace() const {
+ return CalleeSaveStackHasFreeSpace;
+ }
+ void setCalleeSaveStackHasFreeSpace(bool s) {
+ CalleeSaveStackHasFreeSpace = s;
+ }
+
bool isSplitCSR() const { return IsSplitCSR; }
void setIsSplitCSR(bool s) { IsSplitCSR = s; }
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
unsigned getLocalStackSize() const { return LocalStackSize; }
+ void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+ unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
unsigned getNumLocalDynamicTLSAccesses() const {
return NumLocalDynamicTLSAccesses;
@@ -140,15 +166,15 @@ public:
SmallVector<const MachineInstr *, 3> Args;
public:
- typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+ typedef ArrayRef<const MachineInstr *> LOHArgs;
- MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+ MILOHDirective(MCLOHType Kind, LOHArgs Args)
: Kind(Kind), Args(Args.begin(), Args.end()) {
assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
}
MCLOHType getKind() const { return Kind; }
- const LOHArgs &getArgs() const { return Args; }
+ LOHArgs getArgs() const { return Args; }
};
typedef MILOHDirective::LOHArgs MILOHArgs;
@@ -157,7 +183,7 @@ public:
const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
/// Add a LOH directive of this @p Kind and this @p Args.
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+ void addLOHDirective(MCLOHType Kind, MILOHArgs Args) {
LOHContainerSet.push_back(MILOHDirective(Kind, Args));
LOHRelated.insert(Args.begin(), Args.end());
}
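A side note on the LOHArgs change above: replacing `const SmallVectorImpl<...> &` with an ArrayRef taken by value works because an ArrayRef-style view is just a non-owning (pointer, length) pair over contiguous storage. A minimal sketch with a toy view type (not llvm::ArrayRef itself):

    #include <cstddef>
    #include <vector>

    // Toy stand-in for llvm::ArrayRef: a non-owning (pointer, length) view.
    template <typename T> class ViewRef {
      const T *Data = nullptr;
      std::size_t Length = 0;

    public:
      ViewRef(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
      const T *begin() const { return Data; }
      const T *end() const { return Data + Length; }
    };

    // Passing the view by value copies two words, which is as cheap as the
    // old reference parameter and decouples callers from the container type.
    long sum(ViewRef<long> Args) {
      long Total = 0;
      for (long A : Args)
        Total += A;
      return Total;
    }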
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index 5394875a6bc12..038162c6f54a9 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -320,7 +320,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
const MachineInstr &MI) {
const LiveInterval &LI = LIs.getInterval(reg);
- SlotIndex SI = LIs.getInstructionIndex(&MI);
+ SlotIndex SI = LIs.getInstructionIndex(MI);
return LI.expiredAt(SI);
}
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 79c09d9f058d6..b1e40510b2ae7 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -85,6 +85,21 @@ namespace {
class AArch64PromoteConstant : public ModulePass {
public:
+ struct PromotedConstant {
+ bool ShouldConvert = false;
+ GlobalVariable *GV = nullptr;
+ };
+ typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy;
+
+ struct UpdateRecord {
+ Constant *C;
+ Instruction *User;
+ unsigned Op;
+
+ UpdateRecord(Constant *C, Instruction *User, unsigned Op)
+ : C(C), User(User), Op(Op) {}
+ };
+
static char ID;
AArch64PromoteConstant() : ModulePass(ID) {}
@@ -94,9 +109,12 @@ public:
/// global variables with module scope.
bool runOnModule(Module &M) override {
DEBUG(dbgs() << getPassName() << '\n');
+ if (skipModule(M))
+ return false;
bool Changed = false;
+ PromotionCacheTy PromotionCache;
for (auto &MF : M) {
- Changed |= runOnFunction(MF);
+ Changed |= runOnFunction(MF, PromotionCache);
}
return Changed;
}
@@ -105,7 +123,7 @@ private:
/// Look for interesting constants used within the given function.
/// Promote them into global variables, load these global variables within
/// the related function, so that the number of inserted loads is minimal.
- bool runOnFunction(Function &F);
+ bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache);
// This transformation requires dominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -115,79 +133,72 @@ private:
}
/// Type to store a list of Uses.
- typedef SmallVector<Use *, 4> Uses;
+ typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses;
/// Map an insertion point to all the uses it dominates.
typedef DenseMap<Instruction *, Uses> InsertionPoints;
- /// Map a function to the required insertion point of load for a
- /// global variable.
- typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
/// Find the closest point that dominates the given Use.
- Instruction *findInsertionPoint(Use &Use);
+ Instruction *findInsertionPoint(Instruction &User, unsigned OpNo);
/// Check if the given insertion point is dominated by an existing
/// insertion point.
/// If true, the given use is added to the list of dominated uses for
/// the related existing point.
/// \param NewPt the insertion point to be checked
- /// \param Use the use to be added into the list of dominated uses
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
/// \param InsertPts existing insertion points
/// \pre NewPt and all instructions in InsertPts belong to the same function
/// \return true if one of the insertion points in InsertPts dominates NewPt,
/// false otherwise
- bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
+ bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Check if the given insertion point can be merged with an existing
/// insertion point in a common dominator.
/// If true, the given use is added to the list of the created insertion
/// point.
/// \param NewPt the insertion point to be checked
- /// \param Use the use to be added into the list of dominated uses
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
/// \param InsertPts existing insertion points
/// \pre NewPt and all instructions in InsertPts belong to the same function
/// \pre isDominated returns false for the exact same parameters.
/// \return true if there exists an insertion point in InsertPts that could
/// have been merged with NewPt in a common dominator,
/// false otherwise
- bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
+ bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Compute the minimal insertion points to dominate all the interesting
/// uses of value.
/// Insertion points are grouped per function and each insertion point
/// contains a list of all the uses it dominates within the related function.
- /// \param Val constant to be examined
- /// \param[out] InsPtsPerFunc output storage of the analysis
- void computeInsertionPoints(Constant *Val,
- InsertionPointsPerFunc &InsPtsPerFunc);
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the constant
+ /// \param[out] InsertPts output storage of the analysis
+ void computeInsertionPoint(Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Insert a definition of a new global variable at each point contained in
/// InsPtsPerFunc and update the related uses (also contained in
/// InsPtsPerFunc).
- bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
-
- /// Compute the minimal insertion points to dominate all the interesting
- /// uses of Val and insert a definition of a new global variable
- /// at these points.
- /// Also update the uses of Val accordingly.
- /// Currently a use of Val is considered interesting if:
- /// - Val is not UndefValue
- /// - Val is not zeroinitialized
- /// - Replacing Val per a load of a global variable is valid.
- /// \see shouldConvert for more details
- bool computeAndInsertDefinitions(Constant *Val);
-
- /// Promote the given constant into a global variable if it is expected to
- /// be profitable.
- /// \return true if Cst has been promoted
- bool promoteConstant(Constant *Cst);
+ void insertDefinitions(Function &F, GlobalVariable &GV,
+ InsertionPoints &InsertPts);
+
+ /// Do the constant promotion indicated by the Updates records, keeping track
+ /// of globals in PromotionCache.
+ void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache);
/// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
/// Append Use to this list and delete the entry of IPI in InsertPts.
- static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use,
+ static void appendAndTransferDominatedUses(Instruction *NewPt,
+ Instruction *User, unsigned OpNo,
InsertionPoints::iterator &IPI,
InsertionPoints &InsertPts) {
// Record the dominated use.
- IPI->second.push_back(&Use);
+ IPI->second.emplace_back(User, OpNo);
// Transfer the dominated uses of IPI to NewPt
// Inserting into the DenseMap may invalidate existing iterator.
// Keep a copy of the key to find the iterator to erase. Keep a copy of the
@@ -285,10 +296,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
// Do not mess with inline asm.
const CallInst *CI = dyn_cast<const CallInst>(Instr);
- if (CI && isa<const InlineAsm>(CI->getCalledValue()))
- return false;
-
- return true;
+ return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
}
/// Check if the given Cst should be converted into
@@ -305,7 +313,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
/// for the regular approach, even for float).
/// Again, the simplest solution would be to promote every
/// constant and rematerialize them when they are actually cheap to create.
-static bool shouldConvert(const Constant *Cst) {
+static bool shouldConvertImpl(const Constant *Cst) {
if (isa<const UndefValue>(Cst))
return false;
@@ -328,18 +336,28 @@ static bool shouldConvert(const Constant *Cst) {
return isConstantUsingVectorTy(Cst->getType());
}
-Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) {
- Instruction *User = cast<Instruction>(Use.getUser());
+static bool
+shouldConvert(Constant &C,
+ AArch64PromoteConstant::PromotionCacheTy &PromotionCache) {
+ auto Converted = PromotionCache.insert(
+ std::make_pair(&C, AArch64PromoteConstant::PromotedConstant()));
+ if (Converted.second)
+ Converted.first->second.ShouldConvert = shouldConvertImpl(&C);
+ return Converted.first->second.ShouldConvert;
+}
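The PromotionCache lookup above uses the classic insert-then-fill memoization idiom: insert() returns an (iterator, inserted) pair, so the predicate runs only the first time a constant is seen. A minimal standalone sketch of the same idiom (std::unordered_map stands in for SmallDenseMap):

    #include <string>
    #include <unordered_map>

    bool expensivePredicate(const std::string &S) { return S.size() % 2 == 0; }

    // Compute the answer only when the key was actually new, exactly like
    // the PromotionCache lookup above; repeat queries reuse the stored bit.
    bool cachedPredicate(const std::string &S,
                         std::unordered_map<std::string, bool> &Cache) {
      auto Res = Cache.insert({S, false});
      if (Res.second)
        Res.first->second = expensivePredicate(S);
      return Res.first->second;
    }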
+Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User,
+ unsigned OpNo) {
// If this user is a phi, the insertion point is in the related
// incoming basic block.
- if (PHINode *PhiInst = dyn_cast<PHINode>(User))
- return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+ if (PHINode *PhiInst = dyn_cast<PHINode>(&User))
+ return PhiInst->getIncomingBlock(OpNo)->getTerminator();
- return User;
+ return &User;
}
-bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use,
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
InsertionPoints &InsertPts) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
@@ -358,14 +376,15 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << "Insertion point dominated by:\n");
DEBUG(IPI.first->print(dbgs()));
DEBUG(dbgs() << '\n');
- IPI.second.push_back(&Use);
+ IPI.second.emplace_back(User, OpNo);
return true;
}
}
return false;
}
-bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
InsertionPoints &InsertPts) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
*NewPt->getParent()->getParent()).getDomTree();
@@ -385,7 +404,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << "Merge insertion point with:\n");
DEBUG(IPI->first->print(dbgs()));
DEBUG(dbgs() << "\nat considered insertion point.\n");
- appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -409,149 +428,141 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << '\n');
DEBUG(NewPt->print(dbgs()));
DEBUG(dbgs() << '\n');
- appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
return false;
}
-void AArch64PromoteConstant::computeInsertionPoints(
- Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
- DEBUG(dbgs() << "** Compute insertion points **\n");
- for (Use &Use : Val->uses()) {
- Instruction *User = dyn_cast<Instruction>(Use.getUser());
-
- // If the user is not an Instruction, we cannot modify it.
- if (!User)
- continue;
-
- // Filter out uses that should not be converted.
- if (!shouldConvertUse(Val, User, Use.getOperandNo()))
- continue;
+void AArch64PromoteConstant::computeInsertionPoint(
+ Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
+ DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+ DEBUG(User->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n");
- DEBUG(User->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
- Instruction *InsertionPoint = findInsertionPoint(Use);
+ DEBUG(dbgs() << "Considered insertion point:\n");
+ DEBUG(InsertionPoint->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- DEBUG(dbgs() << "Considered insertion point:\n");
- DEBUG(InsertionPoint->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ if (isDominated(InsertionPoint, User, OpNo, InsertPts))
+ return;
+ // This insertion point is useful, check if we can merge some insertion
+ // point in a common dominator or if NewPt dominates an existing one.
+ if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
+ return;
- // Check if the current insertion point is useless, i.e., it is dominated
- // by another one.
- InsertionPoints &InsertPts =
- InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
- if (isDominated(InsertionPoint, Use, InsertPts))
- continue;
- // This insertion point is useful, check if we can merge some insertion
- // point in a common dominator or if NewPt dominates an existing one.
- if (tryAndMerge(InsertionPoint, Use, InsertPts))
- continue;
-
- DEBUG(dbgs() << "Keep considered insertion point\n");
+ DEBUG(dbgs() << "Keep considered insertion point\n");
- // It is definitely useful by its own
- InsertPts[InsertionPoint].push_back(&Use);
- }
+ // It is definitely useful by its own
+ InsertPts[InsertionPoint].emplace_back(User, OpNo);
}
-bool AArch64PromoteConstant::insertDefinitions(
- Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
- // We will create one global variable per Module.
- DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
- bool HasChanged = false;
+static void ensurePromotedGV(Function &F, Constant &C,
+ AArch64PromoteConstant::PromotedConstant &PC) {
+ assert(PC.ShouldConvert &&
+ "Expected that we should convert this to a global");
+ if (PC.GV)
+ return;
+ PC.GV = new GlobalVariable(
+ *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
+ "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+ PC.GV->setInitializer(&C);
+ DEBUG(dbgs() << "Global replacement: ");
+ DEBUG(PC.GV->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ ++NumPromoted;
+}
- // Traverse all insertion points in all the function.
- for (const auto &FctToInstPtsIt : InsPtsPerFunc) {
- const InsertionPoints &InsertPts = FctToInstPtsIt.second;
-// Do more checking for debug purposes.
+void AArch64PromoteConstant::insertDefinitions(Function &F,
+ GlobalVariable &PromotedGV,
+ InsertionPoints &InsertPts) {
#ifndef NDEBUG
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
- *FctToInstPtsIt.first).getDomTree();
+ // Do more checking for debug purposes.
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
#endif
- assert(!InsertPts.empty() && "Empty uses does not need a definition");
-
- Module *M = FctToInstPtsIt.first->getParent();
- GlobalVariable *&PromotedGV = ModuleToMergedGV[M];
- if (!PromotedGV) {
- PromotedGV = new GlobalVariable(
- *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
- "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
- PromotedGV->setInitializer(Cst);
- DEBUG(dbgs() << "Global replacement: ");
- DEBUG(PromotedGV->print(dbgs()));
- DEBUG(dbgs() << '\n');
- ++NumPromoted;
- HasChanged = true;
- }
-
- for (const auto &IPI : InsertPts) {
- // Create the load of the global variable.
- IRBuilder<> Builder(IPI.first);
- LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
- DEBUG(dbgs() << "**********\n");
- DEBUG(dbgs() << "New def: ");
- DEBUG(LoadedCst->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ assert(!InsertPts.empty() && "Empty uses do not need a definition");
+
+ for (const auto &IPI : InsertPts) {
+ // Create the load of the global variable.
+ IRBuilder<> Builder(IPI.first);
+ LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
+ DEBUG(dbgs() << "**********\n");
+ DEBUG(dbgs() << "New def: ");
+ DEBUG(LoadedCst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- // Update the dominated uses.
- for (Use *Use : IPI.second) {
+ // Update the dominated uses.
+ for (auto Use : IPI.second) {
#ifndef NDEBUG
- assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) &&
- "Inserted definition does not dominate all its uses!");
+ assert(DT.dominates(LoadedCst,
+ findInsertionPoint(*Use.first, Use.second)) &&
+ "Inserted definition does not dominate all its uses!");
#endif
- DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":");
- DEBUG(Use->getUser()->print(dbgs()));
- DEBUG(dbgs() << '\n');
- Use->set(LoadedCst);
- ++NumPromotedUses;
- }
+ DEBUG({
+ dbgs() << "Use to update " << Use.second << ":";
+ Use.first->print(dbgs());
+ dbgs() << '\n';
+ });
+ Use.first->setOperand(Use.second, LoadedCst);
+ ++NumPromotedUses;
}
}
- return HasChanged;
}
-bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
- InsertionPointsPerFunc InsertPtsPerFunc;
- computeInsertionPoints(Val, InsertPtsPerFunc);
- return insertDefinitions(Val, InsertPtsPerFunc);
-}
-
-bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
- assert(Cst && "Given variable is not a valid constant.");
-
- if (!shouldConvert(Cst))
- return false;
-
- DEBUG(dbgs() << "******************************\n");
- DEBUG(dbgs() << "Candidate constant: ");
- DEBUG(Cst->print(dbgs()));
- DEBUG(dbgs() << '\n');
-
- return computeAndInsertDefinitions(Cst);
+void AArch64PromoteConstant::promoteConstants(
+ Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache) {
+ // Promote the constants.
+ for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
+ DEBUG(dbgs() << "** Compute insertion points **\n");
+ auto First = U;
+ Constant *C = First->C;
+ InsertionPoints InsertPts;
+ do {
+ computeInsertionPoint(U->User, U->Op, InsertPts);
+ } while (++U != E && U->C == C);
+
+ auto &Promotion = PromotionCache[C];
+ ensurePromotedGV(F, *C, Promotion);
+ insertDefinitions(F, *Promotion.GV, InsertPts);
+ }
}
-bool AArch64PromoteConstant::runOnFunction(Function &F) {
+bool AArch64PromoteConstant::runOnFunction(Function &F,
+ PromotionCacheTy &PromotionCache) {
// Look for instructions using constant vectors. Promote such a constant to a
// global variable. Create as few loads of this variable as possible and
// update the uses accordingly.
- bool LocalChange = false;
- SmallPtrSet<Constant *, 8> AlreadyChecked;
-
+ SmallVector<UpdateRecord, 64> Updates;
for (Instruction &I : instructions(&F)) {
// Traverse the operands, looking for constant vectors. Replace them by a
// load of a global variable of constant vector type.
- for (Value *Op : I.operand_values()) {
- Constant *Cst = dyn_cast<Constant>(Op);
+ for (Use &U : I.operands()) {
+ Constant *Cst = dyn_cast<Constant>(U);
// There is no point in promoting global values as they are already
// global. Do not promote constant expressions either, as they may
// require some code expansion.
- if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
- AlreadyChecked.insert(Cst).second)
- LocalChange |= promoteConstant(Cst);
+ if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+ continue;
+
+ // Check if this constant is worth promoting.
+ if (!shouldConvert(*Cst, PromotionCache))
+ continue;
+
+ // Check if this use should be promoted.
+ unsigned OpNo = &U - I.op_begin();
+ if (!shouldConvertUse(Cst, &I, OpNo))
+ continue;
+
+ Updates.emplace_back(Cst, &I, OpNo);
}
}
- return LocalChange;
+
+ if (Updates.empty())
+ return false;
+
+ promoteConstants(F, Updates, PromotionCache);
+ return true;
}
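The promoteConstants loop above batches the Updates list by contiguous runs that share the same Constant. A standalone sketch of that run-grouping control flow (toy types; printf stands in for the per-use insertion-point work):

    #include <cstdio>
    #include <vector>

    struct Record { int Key; int Payload; };

    // Walk the records once; each contiguous run sharing a key becomes one
    // batch, the way promoteConstants batches UpdateRecords per Constant.
    void processRuns(const std::vector<Record> &Updates) {
      for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
        int Key = U->Key;
        std::printf("batch for key %d:", Key);
        do {
          std::printf(" %d", U->Payload); // one computeInsertionPoint-ish step
        } while (++U != E && U->Key == Key);
        std::printf("\n");
      }
    }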
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
new file mode 100644
index 0000000000000..60d8bbd260bb7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -0,0 +1,182 @@
+//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass removes unnecessary zero copies in BBs that are targets of
+// cbz/cbnz instructions. For instance, the copy instruction in the code below
+// can be removed because the CBZW jumps to BB#2 when W0 is zero.
+// BB#1:
+// CBZW %W0, <BB#2>
+// BB#2:
+// %W0 = COPY %WZR
+// This pass should be run after register allocation.
+//
+// FIXME: This should be extended to handle any constant other than zero. E.g.,
+// cmp w0, #1
+// b.eq .BB1
+// BB1:
+// mov w0, #1
+//
+// FIXME: This could also be extended to check the whole dominance subtree below
+// the comparison if the compile time regression is acceptable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-copyelim"
+
+STATISTIC(NumCopiesRemoved, "Number of copies removed.");
+
+namespace llvm {
+void initializeAArch64RedundantCopyEliminationPass(PassRegistry &);
+}
+
+namespace {
+class AArch64RedundantCopyElimination : public MachineFunctionPass {
+ const MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+public:
+ static char ID;
+ AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {}
+ bool optimizeCopy(MachineBasicBlock *MBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+ const char *getPassName() const override {
+ return "AArch64 Redundant Copy Elimination";
+ }
+};
+char AArch64RedundantCopyElimination::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
+ "AArch64 redundant copy elimination pass", false, false)
+
+static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
+ unsigned Opc = MI.getOpcode();
+ // Check if the current basic block is the target block to which the
+ // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
+ if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == MI.getOperand(1).getMBB())
+ return true;
+ else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != MI.getOperand(1).getMBB())
+ return true;
+
+ return false;
+}
+
+bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
+ // Check if the current basic block has a single predecessor.
+ if (MBB->pred_size() != 1)
+ return false;
+
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
+ if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ return false;
+
+ ++CompBr;
+ do {
+ --CompBr;
+ if (guaranteesZeroRegInBlock(*CompBr, MBB))
+ break;
+ } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+
+ // If we did not find a CBZ/CBNZ, bail out.
+ if (!guaranteesZeroRegInBlock(*CompBr, MBB))
+ return false;
+
+ unsigned TargetReg = CompBr->getOperand(0).getReg();
+ if (!TargetReg)
+ return false;
+ assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
+ "Expect physical register");
+
+ // Remember all registers aliasing with TargetReg.
+ SmallSetVector<unsigned, 8> TargetRegs;
+ for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
+ TargetRegs.insert(*AI);
+
+ bool Changed = false;
+ MachineBasicBlock::iterator LastChange = MBB->begin();
+ unsigned SmallestDef = TargetReg;
+ // Remove redundant Copy instructions unless TargetReg is modified.
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr *MI = &*I;
+ ++I;
+ if (MI->isCopy() && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg()) {
+
+ unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+
+ if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
+ !MRI->isReserved(DefReg) &&
+ (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
+ DEBUG(dbgs() << "Remove redundant Copy : ");
+ DEBUG((MI)->print(dbgs()));
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ SmallestDef =
+ TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
+ continue;
+ }
+ }
+
+ if (MI->modifiesRegister(TargetReg, TRI))
+ break;
+ }
+
+ if (!Changed)
+ return false;
+
+ // Otherwise, we have to fix up the use-def chain, starting with the
+ // CBZ/CBNZ. Conservatively mark as much as we can as live.
+ CompBr->clearRegisterKills(SmallestDef, TRI);
+
+ if (std::none_of(TargetRegs.begin(), TargetRegs.end(),
+ [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
+ MBB->addLiveIn(TargetReg);
+
+ // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ for (MachineInstr &MMI :
+ make_range(MBB->begin()->getIterator(), LastChange->getIterator()))
+ MMI.clearRegisterKills(SmallestDef, TRI);
+
+ return true;
+}
+
+bool AArch64RedundantCopyElimination::runOnMachineFunction(
+ MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF)
+ Changed |= optimizeCopy(&MBB);
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() {
+ return new AArch64RedundantCopyElimination();
+}
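Boiled down, the scan in optimizeCopy deletes copies of zero into the known-zero register until something redefines it. A toy model of that loop (plain containers rather than MachineInstr; the field names are illustrative):

    #include <cstddef>
    #include <vector>

    struct Inst {
      bool IsZeroCopyIntoTarget; // e.g. %w0 = COPY %wzr, with %w0 known zero
      bool RedefinesTarget;      // any other write to the tracked register
    };

    // Delete zero-copies while the register is still provably zero; the
    // first unrelated redefinition ends the window, as in optimizeCopy.
    std::size_t eraseRedundantZeroCopies(std::vector<Inst> &Block) {
      std::size_t Removed = 0;
      for (auto I = Block.begin(); I != Block.end();) {
        if (I->IsZeroCopyIntoTarget) {
          I = Block.erase(I);
          ++Removed;
          continue;
        }
        if (I->RedefinesTarget)
          break;
        ++I;
      }
      return Removed;
    }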
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
new file mode 100644
index 0000000000000..0a1831bd9a8ca
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -0,0 +1,168 @@
+//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : RegisterBankInfo(AArch64::NumRegisterBanks) {
+ // Initialize the GPR bank.
+ createRegisterBank(AArch64::GPRRegBankID, "GPR");
+ // The GPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+ const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+ (void)RBGPR;
+ assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+ // Initialize the FPR bank.
+ createRegisterBank(AArch64::FPRRegBankID, "FPR");
+ // The FPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.getSize() == 512 &&
+ "FPRs should hold up to 512-bit via QQQQ sequence");
+
+ // Initialize the CCR bank.
+ createRegisterBank(AArch64::CCRRegBankID, "CCR");
+ addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+ (void)RBCCR;
+ assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+ "Class not added?");
+ assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+ assert(verify(TRI) && "Invalid register bank information");
+}
+
+unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
+ const RegisterBank &B,
+ unsigned Size) const {
+ // What do we do with different sizes? A copy is, by definition, between
+ // registers of the same size. Other hooks will be introduced for the
+ // different-size cases:
+ // * extract cost.
+ // * build_sequence cost.
+ // TODO: Add more accurate cost for FPR to/from GPR.
+ return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+ switch (RC.getID()) {
+ case AArch64::FPR8RegClassID:
+ case AArch64::FPR16RegClassID:
+ case AArch64::FPR32RegClassID:
+ case AArch64::FPR64RegClassID:
+ case AArch64::FPR128RegClassID:
+ case AArch64::FPR128_loRegClassID:
+ case AArch64::DDRegClassID:
+ case AArch64::DDDRegClassID:
+ case AArch64::DDDDRegClassID:
+ case AArch64::QQRegClassID:
+ case AArch64::QQQRegClassID:
+ case AArch64::QQQQRegClassID:
+ return getRegBank(AArch64::FPRRegBankID);
+ case AArch64::GPR32commonRegClassID:
+ case AArch64::GPR32RegClassID:
+ case AArch64::GPR32spRegClassID:
+ case AArch64::GPR32sponlyRegClassID:
+ case AArch64::GPR32allRegClassID:
+ case AArch64::GPR64commonRegClassID:
+ case AArch64::GPR64RegClassID:
+ case AArch64::GPR64spRegClassID:
+ case AArch64::GPR64sponlyRegClassID:
+ case AArch64::GPR64allRegClassID:
+ case AArch64::tcGPR64RegClassID:
+ case AArch64::WSeqPairsClassRegClassID:
+ case AArch64::XSeqPairsClassRegClassID:
+ return getRegBank(AArch64::GPRRegBankID);
+ case AArch64::CCRRegClassID:
+ return getRegBank(AArch64::CCRRegBankID);
+ default:
+ llvm_unreachable("Register class not supported");
+ }
+}
+
+RegisterBankInfo::InstructionMappings
+AArch64RegisterBankInfo::getInstrAlternativeMappings(
+ const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_OR: {
+ // 32 and 64-bit or can be mapped on either FPR or
+ // GPR for the same cost.
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ if (Size != 32 && Size != 64)
+ break;
+
+ // If the instruction has any implicit-defs or uses,
+ // do not mess with it.
+ if (MI.getNumOperands() != 3)
+ break;
+ InstructionMappings AltMappings;
+ InstructionMapping GPRMapping(/*ID*/ 1, /*Cost*/ 1, /*NumOperands*/ 3);
+ InstructionMapping FPRMapping(/*ID*/ 2, /*Cost*/ 1, /*NumOperands*/ 3);
+ for (unsigned Idx = 0; Idx != 3; ++Idx) {
+ GPRMapping.setOperandMapping(Idx, Size,
+ getRegBank(AArch64::GPRRegBankID));
+ FPRMapping.setOperandMapping(Idx, Size,
+ getRegBank(AArch64::FPRRegBankID));
+ }
+ AltMappings.emplace_back(std::move(GPRMapping));
+ AltMappings.emplace_back(std::move(FPRMapping));
+ return AltMappings;
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
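Since both mappings returned for G_OR carry cost 1, a client would typically break the tie by the cross-bank copies each choice implies. A hedged sketch of such a selection, with illustrative types rather than the GlobalISel API:

    #include <limits>
    #include <vector>

    struct AltMapping { unsigned ID; unsigned InstrCost; unsigned CopyCost; };

    // When instruction costs tie (both 1 for G_OR), the cross-bank copy cost
    // each alternative would force is what actually decides the winner.
    unsigned pickMapping(const std::vector<AltMapping> &Alts) {
      unsigned BestID = 0;
      unsigned Best = std::numeric_limits<unsigned>::max();
      for (const AltMapping &M : Alts) {
        unsigned Total = M.InstrCost + M.CopyCost;
        if (Total < Best) {
          Best = Total;
          BestID = M.ID;
        }
      }
      return BestID;
    }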
+
+void AArch64RegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ switch (OpdMapper.getMI().getOpcode()) {
+ case TargetOpcode::G_OR: {
+ // Those IDs must match getInstrAlternativeMappings.
+ assert((OpdMapper.getInstrMapping().getID() == 1 ||
+ OpdMapper.getInstrMapping().getID() == 2) &&
+ "Don't know how to handle that ID");
+ return applyDefaultMapping(OpdMapper);
+ }
+ default:
+ llvm_unreachable("Don't know how to handle that operation");
+ }
+}
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
new file mode 100644
index 0000000000000..907bcfdea161b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -0,0 +1,69 @@
+//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace AArch64 {
+enum {
+ GPRRegBankID = 0, ///< General Purpose Registers: W, X.
+ FPRRegBankID = 1, ///< Floating Point/Vector Registers: B, H, S, D, Q.
+ CCRRegBankID = 2, ///< Conditional register: NZCV.
+ NumRegisterBanks
+};
+} // End AArch64 namespace.
+
+/// This class provides the information for the target register banks.
+class AArch64RegisterBankInfo : public RegisterBankInfo {
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+public:
+ AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
+ /// Get the cost of a copy from \p B to \p A, or put differently,
+ /// get the cost of A = COPY B. Since register banks may cover
+ /// different sizes, \p Size specifies the size in bits that will be
+ /// copied around.
+ ///
+ /// \note Since this is a copy, both registers have the same size.
+ unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+ unsigned Size) const override;
+
+ /// Get a register bank that covers \p RC.
+ ///
+ /// \pre \p RC is a user-defined register class (as opposed to one
+ /// generated by TableGen).
+ ///
+ /// \note The mapping RC -> RegBank could be built while adding the
+ /// coverage for the register banks. However, we do not do it, because,
+ /// at least for now, we only need this information for register classes
+ /// that are used in the description of instructions. In other words,
+ /// there are just a handful of them and we do not want to waste space.
+ ///
+ /// \todo This should be TableGen'ed.
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ /// Get the alternative mappings for \p MI.
+ /// Alternative in the sense different from getInstrMapping.
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 32b4888f2f647..af867da4823d1 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
@@ -51,6 +50,13 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
CSR_AArch64_CXX_TLS_Darwin_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost)
+ return CSR_AArch64_RT_MostRegs_SaveList;
else
return CSR_AArch64_AAPCS_SaveList;
}
@@ -74,6 +80,12 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_AArch64_AllRegs_RegMask;
if (CC == CallingConv::CXX_FAST_TLS)
return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+ if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::PreserveMost)
+ return CSR_AArch64_RT_MostRegs_RegMask;
else
return CSR_AArch64_AAPCS_RegMask;
}
@@ -190,9 +202,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// If it's wrong, we'll materialize the constant and still get to the
// object; it's just suboptimal. Negative offsets use the unscaled
// load/store instructions, which have a 9-bit signed immediate.
- if (MFI->getLocalFrameSize() < 256)
- return false;
- return true;
+ return MFI->getLocalFrameSize() >= 256;
}
return false;
@@ -231,9 +241,7 @@ bool AArch64RegisterInfo::requiresFrameIndexScavenging(
bool
AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- // Only consider eliminating leaf frames.
- if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
- MFI->adjustsStack()))
+ if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
return true;
return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
}
@@ -396,8 +404,6 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
-namespace llvm {
-
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -437,5 +443,3 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 16;
}
}
-
-} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a8c8b176efa9f..5fbaff00a5e71 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index d709bee7b9eb4..93ca079275c8c 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -19,13 +19,13 @@
def CortexA53Model : SchedMachineModel {
let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
// Specification - Instruction Timings"
// v 1.0 Spreadsheet
+ let CompleteModel = 1;
}
@@ -109,6 +109,8 @@ def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
let ResourceCycles = [3]; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
// Branch
def : WriteRes<WriteBr, [A53UnitB]>;
def : WriteRes<WriteBrReg, [A53UnitB]>;
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index ca4457af8525a..a266351f7ffc0 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -30,6 +30,7 @@ def CortexA57Model : SchedMachineModel {
// Enable partial & runtime unrolling. The magic number is chosen based on
// experiments and benchmarking data.
let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
}
//===----------------------------------------------------------------------===//
@@ -96,6 +97,8 @@ def : SchedAlias<WriteV, A57Write_3cyc_1V>;
def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
def : SchedAlias<WriteVST, A57Write_1cyc_1S>;
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index a2a1802377894..9fd3ae6818e5d 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -1,4 +1,4 @@
-//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,7 @@ def CycloneModel : SchedMachineModel {
let MicroOpBufferSize = 192; // Based on the reorder buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 16; // 14-19 cycles are typical.
+ let CompleteModel = 1;
}
//===----------------------------------------------------------------------===//
@@ -107,7 +108,7 @@ def WriteX : SchedWriteRes<[]> { let Latency = 0; }
// The move is replaced by a single nop micro-op.
// MOVZ Rd, #0
// AND Rd, Rzr, #imm
-def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
SchedVar<WriteZPred, [WriteX]>,
SchedVar<NoSchedPred, [WriteImm]>]>;
@@ -116,8 +117,8 @@ def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
// Move GPR is a register rename and single nop micro-op.
// ORR Xd, XZR, Xm
// ADD Xd, Xn, #0
-def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
-def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
SchedVar<WriteIMovPred, [WriteX]>,
SchedVar<WriteVMovPred, [WriteX]>,
@@ -726,7 +727,7 @@ def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
(instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
- (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
+ (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
(instregex "LD4Fourv(8b|4h|2s)$")>;
@@ -851,6 +852,9 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+// Atomic operations are not supported.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
//---
// Unused SchedRead types
//---
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
new file mode 100644
index 0000000000000..4e491a04c78df
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -0,0 +1,133 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. The latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue slot.
+
+def KryoModel : SchedMachineModel {
+ let IssueWidth = 5; // 5-wide issue for expanded uops
+ let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+ def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops
+ def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops
+ def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops
+ def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops
+ def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops
+ KryoUnitXB]>;
+ def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops
+ KryoUnitYB]>;
+ def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops
+ KryoUnitXB,
+ KryoUnitYA,
+ KryoUnitYB]>;
+ def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops
+ def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops
+ def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops
+ KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]>
+ { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
new file mode 100644
index 0000000000000..426ae6103e4b5
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -0,0 +1,2358 @@
+//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Kryo subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+def KryoWrite_3cyc_X_noRSV_138ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln],
+ (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>;
+
+def KryoWrite_3cyc_X_X_139ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_139ln],
+ (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>;
+
+def KryoWrite_4cyc_XY_XY_noRSV_172ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln],
+ (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_XY_XY_XY_XY_178ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln],
+ (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_XY_XY_XY_XY_177ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln],
+ (instregex "(S|U)ABALv.*")>;
+def KryoWrite_3cyc_XY_XY_166ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_166ln],
+ (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_159ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln],
+ (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_165ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_165ln],
+ (instregex "(S|U)ABDLv.*")>;
+def KryoWrite_3cyc_X_noRSV_154ln :
+ SchedWriteRes<[KryoUnitX]> {
+let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln],
+ (instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>;
+def KryoWrite_3cyc_X_X_155ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_155ln],
+ (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>;
+def KryoWrite_2cyc_XY_XY_151ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_151ln],
+ (instregex "(S|U)(ADD|SUB)Lv.*")>;
+def KryoWrite_2cyc_XY_noRSV_148ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>;
+def KryoWrite_2cyc_XY_XY_150ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_150ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>;
+def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln],
+ (instrs SADDLVv4i32v, UADDLVv4i32v)>;
+def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln],
+ (instrs SADDLVv8i16v, UADDLVv8i16v)>;
+def KryoWrite_6cyc_XY_XY_X_noRSV_181ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln],
+ (instrs SADDLVv16i8v, UADDLVv16i8v)>;
+def KryoWrite_3cyc_XY_noRSV_158ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln],
+ (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>;
+def KryoWrite_4cyc_X_noRSV_169ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln],
+ (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>;
+def KryoWrite_2cyc_XY_XY_XY_XY_176ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln],
+ (instregex "(S|U)(ADDW|SUBW)v.*")>;
+def KryoWrite_4cyc_X_noRSV_40ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln],
+ (instregex "(S|U)CVTFS(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_97ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln],
+ (instregex "(S|U)CVTFU(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_110ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln],
+ (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+def KryoWrite_4cyc_X_X_114ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_114ln],
+ (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+def KryoWrite_1cyc_XA_Y_98ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_98ln],
+ (instregex "(S|U)DIV(_Int)?(W|X)r")>;
+def KryoWrite_2cyc_XY_XY_152ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_152ln],
+ (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_149ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln],
+ (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_X_70ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_70ln],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def KryoWrite_4cyc_X_X_191ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_191ln],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def KryoWrite_1cyc_XY_195ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_195ln],
+ (instregex "(S|U)MOVv.*")>;
+def KryoWrite_5cyc_X_71ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_71ln],
+ (instrs SMULHrr, UMULHrr)>;
+def KryoWrite_3cyc_XY_noRSV_186ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln],
+ (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_187ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_187ln],
+ (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_69ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln],
+ (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_248ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln],
+ (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_3cyc_XY_XY_250ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_250ln],
+ (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_246ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>;
+def KryoWrite_3cyc_XY_XY_251ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_251ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_6cyc_XY_X_238ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_238ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_249ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>;
+def KryoWrite_6cyc_XY_X_noRSV_252ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_161ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln],
+ (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>;
+def KryoWrite_3cyc_XY_noRSV_163ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln],
+ (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_162ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln],
+ (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>;
+def KryoWrite_3cyc_XY_noRSV_247ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln],
+ (instregex "(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_noRSV_239ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln],
+ (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_243ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_243ln],
+ (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_XY_241ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_241ln],
+ (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def KryoWrite_2cyc_XY_noRSV_240ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln],
+ (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_XY_242ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_242ln],
+ (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_2cyc_XY_XY_183ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_183ln],
+ (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_182ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln],
+ (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_184ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln],
+ (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>;
+def KryoWrite_4cyc_X_noRSV_185ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln],
+ (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>;
+def KryoWrite_2cyc_XY_noRSV_67ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln],
+ (instrs ABSv1i64)>;
+def KryoWrite_1cyc_XY_63ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI],
+ (instregex "ADC.*")>;
+def KryoWrite_1cyc_XY_63_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63_1ln],
+ (instregex "ADR.*")>;
+def KryoWrite_1cyc_XY_62ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI],
+ (instregex "ADDS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_64ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI],
+ (instregex "ADDS?(W|X)r(r|s|x)(64)?")>;
+def KryoWrite_1cyc_XY_noRSV_65ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln],
+ (instrs ADDv1i64)>;
+def KryoWrite_1cyc_XY_noRSV_144ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_146ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_146ln],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_XY_X_noRSV_171ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln],
+ (instregex "(ADD|SUB)HNv.*")>;
+def KryoWrite_1cyc_XY_noRSV_66ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln],
+ (instrs ADDPv2i64p)>;
+def KryoWrite_2cyc_XY_XY_153ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_153ln],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_XY_noRSV_170ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln],
+ (instrs ADDVv4i32v)>;
+def KryoWrite_4cyc_XY_XY_noRSV_173ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln],
+ (instrs ADDVv8i16v)>;
+def KryoWrite_5cyc_XY_X_noRSV_174ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln],
+ (instrs ADDVv16i8v)>;
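+// Crypto: the AES round ops crack into four micro-ops spread across the XY
+// and X pipes, while the SHA hash ops (further below) issue as three
+// micro-ops on the XA pipe.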
+def KryoWrite_3cyc_XY_XY_X_X_27ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln],
+ (instrs AESDrr, AESErr)>;
+def KryoWrite_2cyc_X_X_22ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_22ln],
+ (instrs AESIMCrr, AESMCrr)>;
+def KryoWrite_1cyc_XY_noRSV_76ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>;
+def KryoWrite_1cyc_XY_XY_79ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_79ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def KryoWrite_1cyc_X_72ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_72ln],
+ (instregex "(S|U)?BFM.*")>;
+def KryoWrite_1cyc_XY_noRSV_77ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln],
+ (instregex "(BIC|ORR)S?Wri")>;
+def KryoWrite_1cyc_XY_XY_78ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_78ln],
+ (instregex "(BIC|ORR)S?Xri")>;
+def KryoWrite_1cyc_X_noRSV_74ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
+ (instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+def KryoWrite_1cyc_X_X_75ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_75ln],
+ (instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+def KryoWrite_0cyc_noRSV_11ln :
+ SchedWriteRes<[]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_noRSV_11ln],
+ (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>;
+def KryoWrite_0cyc_XY_16ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI],
+ (instregex "(CCMN|CCMP)(W|X)i")>;
+def KryoWrite_0cyc_XY_16_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI],
+ (instregex "(CCMN|CCMP)(W|X)r")>;
+def KryoWrite_2cyc_XY_3ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI],
+ (instregex "(CLS|CLZ)(W|X)r")>;
+def KryoWrite_2cyc_XY_noRSV_7ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_2cyc_XY_XY_8ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_8ln],
+ (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_2cyc_XY_noRSV_80ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_83ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_83ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_noRSV_81ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>;
+def KryoWrite_2cyc_XY_XY_82ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_82ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>;
+def KryoWrite_3cyc_XY_4ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg],
+ (instregex "CRC32.*")>;
+def KryoWrite_1cyc_XY_20ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI],
+ (instregex "CSEL(W|X)r")>;
+def KryoWrite_1cyc_X_17ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI],
+ (instregex "(CSINC|CSNEG)(W|X)r")>;
+def KryoWrite_1cyc_XY_18ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI],
+ (instregex "(CSINV)(W|X)r")>;
+def KryoWrite_3cyc_LS_X_13ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_X_13ln],
+ (instrs DRPS)>;
+def KryoWrite_0cyc_LS_10ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_10ln],
+ (instrs DSB, DMB, CLREX)>;
+def KryoWrite_1cyc_X_noRSV_196ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln],
+ (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>;
+def KryoWrite_1cyc_X_X_197ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_197ln],
+ (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>;
+def KryoWrite_3cyc_LS_LS_X_15ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln],
+ (instrs ERET)>;
+def KryoWrite_1cyc_X_noRSV_207ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_207ln],
+ (instrs EXTv8i8)>;
+def KryoWrite_1cyc_X_X_212ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_212ln],
+ (instrs EXTv16i8)>;
+def KryoWrite_2cyc_XY_X_136ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_X_136ln],
+ (instrs EXTRWrri, EXTRXrri)>;
+def KryoWrite_2cyc_XY_noRSV_35ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln],
+ (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>;
+def KryoWrite_2cyc_XY_XY_106ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_106ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_104ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>;
+def KryoWrite_3cyc_XY_noRSV_107ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln],
+ (instregex "F(MAX|MIN)(NM)?Vv4i32v")>;
+def KryoWrite_3cyc_XY_noRSV_101ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln],
+ (instregex "FABD(32|64|v2f32)")>;
+def KryoWrite_3cyc_XY_XY_103ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_103ln],
+ (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>;
+def KryoWrite_1cyc_XY_noRSV_48ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln],
+ (instregex "F(ABS|NEG)(D|S)r")>;
+def KryoWrite_1cyc_XY_noRSV_124ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln],
+ (instregex "F(ABS|NEG)v2f32")>;
+def KryoWrite_1cyc_XY_XY_125ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_125ln],
+ (instregex "F(ABS|NEG)(v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_33ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln],
+ (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>;
+def KryoWrite_3cyc_XY_noRSV_30ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln],
+ (instregex "(FADD|FSUB)(D|S)rr")>;
+def KryoWrite_3cyc_XY_noRSV_100ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln],
+ (instregex "(FADD|FSUB|FADDP)v2f32")>;
+def KryoWrite_3cyc_XY_noRSV_29ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln],
+ (instregex "FADDP(v2i32p|v2i64p)")>;
+def KryoWrite_0cyc_XY_31ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_31ln],
+ (instregex "FCCMPE?(D|S)rr")>;
+def KryoWrite_2cyc_XY_noRSV_34ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>;
+def KryoWrite_2cyc_XY_XY_36ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_36ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>;
+def KryoWrite_2cyc_XY_noRSV_105ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>;
+def KryoWrite_0cyc_XY_32ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_32ln],
+ (instregex "FCMPE?(D|S)r(r|i)")>;
+def KryoWrite_1cyc_XY_noRSV_49ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln],
+ (instrs FCSELDrrr, FCSELSrrr)>;
+def KryoWrite_4cyc_X_noRSV_41ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln],
+ (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>;
+def KryoWrite_4cyc_X_38ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_38ln],
+ (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>;
+def KryoWrite_4cyc_X_noRSV_113ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>;
+def KryoWrite_4cyc_X_X_117ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_117ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>;
+def KryoWrite_5cyc_X_X_XY_noRSV_119ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln],
+ (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_X_116ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_116ln],
+ (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_noRSV_112ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln],
+ (instrs FCVTXNv1i64)>;
+def KryoWrite_4cyc_X_37ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_37ln],
+ (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def KryoWrite_4cyc_X_noRSV_111ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln],
+ (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>;
+def KryoWrite_4cyc_X_X_115ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_115ln],
+ (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
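+// Floating-point divide (and square root, further below) issues as XA/Y
+// micro-op pairs; note that these entries carry only a nominal 1-cycle
+// latency as written.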
+def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr, FDIVSrr)>;
+def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+ (instrs FDIVv2f32)>;
+def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+ (instrs FDIVv2f64, FDIVv4f32)>;
+def KryoWrite_5cyc_X_noRSV_55ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln],
+ (instregex "FN?M(ADD|SUB)Srrr")>;
+def KryoWrite_6cyc_X_noRSV_57ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln],
+ (instregex "FN?M(ADD|SUB)Drrr")>;
+def KryoWrite_5cyc_X_noRSV_51ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln],
+ (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>;
+def KryoWrite_5cyc_X_X_56ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_56ln],
+ (instrs FMLAv4f32, FMLSv4f32)>;
+def KryoWrite_6cyc_X_X_61ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_61ln],
+ (instrs FMLAv2f64, FMLSv2f64)>;
+def KryoWrite_5cyc_X_noRSV_128ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln],
+ (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_131ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_131ln],
+ (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_134ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_134ln],
+ (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>;
+def KryoWrite_6cyc_X_noRSV_60ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln],
+ (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>;
+def KryoWrite_1cyc_XY_45ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_45ln],
+ (instregex "FMOV(XDHigh|DXHigh|DX)r")>;
+def KryoWrite_1cyc_XY_noRSV_47ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln],
+ (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>;
+def KryoWrite_5cyc_X_noRSV_53ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln],
+ (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>;
+def KryoWrite_5cyc_X_noRSV_127ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln],
+ (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_130ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_130ln],
+ (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_133ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_133ln],
+ (instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>;
+def KryoWrite_5cyc_X_noRSV_54ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln],
+ (instrs FMULSrr, FNMULSrr, FMULX32)>;
+def KryoWrite_6cyc_X_noRSV_59ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln],
+ (instrs FMULDrr, FNMULDrr, FMULX64)>;
+def KryoWrite_3cyc_XY_noRSV_28ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln],
+ (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64)>;
+def KryoWrite_3cyc_XY_noRSV_99ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln],
+ (instrs FRECPEv2f32, FRSQRTEv2f32)>;
+def KryoWrite_3cyc_XY_XY_102ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_102ln],
+ (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def KryoWrite_5cyc_X_noRSV_52ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln],
+ (instrs FRECPS32, FRSQRTS32)>;
+def KryoWrite_6cyc_X_noRSV_58ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln],
+ (instrs FRECPS64, FRSQRTS64)>;
+def KryoWrite_5cyc_X_noRSV_126ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln],
+ (instrs FRECPSv2f32, FRSQRTSv2f32)>;
+def KryoWrite_5cyc_X_X_129ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_129ln],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def KryoWrite_6cyc_X_X_132ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_132ln],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def KryoWrite_3cyc_XY_noRSV_50ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln],
+ (instrs FRECPXv1i32, FRECPXv1i64)>;
+def KryoWrite_2cyc_XY_noRSV_39ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>;
+def KryoWrite_2cyc_XY_noRSV_108ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>;
+def KryoWrite_2cyc_XY_XY_109ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
+def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
+ (instregex "FSQRT(S|D)r")>;
+def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
+ (instregex "FSQRTv2f32")>;
+def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
+ (instregex "FSQRT(v2f64|v4f32)")>;
+def KryoWrite_1cyc_X_201ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_201ln],
+ (instregex "INSv.*")>;
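+// Loads follow. The structured SIMD loads (LD1-LD4) crack into LS micro-ops
+// for the memory accesses plus X micro-ops for any lane insertion; their
+// "_POST" (post-indexed) variants add one XY micro-op for the address
+// writeback, scheduled via WriteAdr.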
+def KryoWrite_3cyc_LS_255ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_255ln],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>;
+def KryoWrite_4cyc_LS_X_270ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_270ln],
+ (instregex "LD1(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_noRSV_285ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_289ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_298ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr],
+ (instregex "LD1(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_308ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_317ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_328ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr],
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_332ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln],
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_281ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_281ln],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_311ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_313ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_256ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_256ln],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_286ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_290ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_318ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_257ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_257ln],
+ (instregex "LD2i64$")>;
+def KryoWrite_3cyc_LS_XY_291ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr],
+ (instregex "LD2i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_296ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln],
+ (instregex "LD2(i8|i16|i32)$")>;
+def KryoWrite_4cyc_LS_XY_X_X_321ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr],
+ (instregex "LD2(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_282ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_282ln],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_312ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_314ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_283ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_283ln],
+ (instregex "LD3i64$")>;
+def KryoWrite_3cyc_LS_LS_LS_309ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln],
+ (instregex "LD3Threev2d$")>;
+def KryoWrite_3cyc_LS_XY_LS_315ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr],
+ (instregex "LD3i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_X_320ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln],
+ (instregex "LD3(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_331ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr],
+ (instregex "LD3Threev2d_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_338ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr],
+ (instregex "LD3(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln],
+ (instregex "LD3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr],
+ (instregex "LD3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln],
+ (instregex "LD3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr],
+ (instregex "LD3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_310ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_333ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_284ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_284ln],
+ (instregex "LD4i64$")>;
+def KryoWrite_3cyc_LS_XY_LS_316ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr],
+ (instregex "LD4i64_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_329ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln],
+ (instregex "LD4Four(v2d)$")>;
+def KryoWrite_4cyc_LS_X_X_X_X_337ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln],
+ (instregex "LD4(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr],
+ (instregex "LD4Four(v2d)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr],
+ (instregex "LD4(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln],
+ (instregex "LD4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr],
+ (instregex "LD4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln],
+ (instregex "LD4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr],
+ (instregex "LD4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_330ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln],
+ (instregex "LD4R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr],
+ (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln],
+ (instregex "LD4R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr],
+ (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_400ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
+ (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
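+// For the paired loads (LDP/LDNP and friends), the second destination
+// register is scheduled through WriteLDHi, and the writeback forms also take
+// WriteAdr for the extra XY address micro-op.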
+def KryoWrite_3cyc_LS_LS_401ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi],
+ (instrs LDNPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_408ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi],
+ (instrs LDNPDi, LDNPSi)>;
+def KryoWrite_3cyc_LS_394ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi],
+ (instrs LDNPWi, LDNPXi)>;
+def KryoWrite_3cyc_LS_LS_402ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi],
+ (instrs LDPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_409ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi],
+ (instrs LDPDi, LDPSi)>;
+def KryoWrite_3cyc_LS_XY_LS_410ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr],
+ (instregex "LDPQ(post|pre)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr],
+ (instregex "LDP(D|S)(post|pre)")>;
+def KryoWrite_3cyc_LS_393ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi],
+ (instrs LDPWi, LDPXi)>;
+def KryoWrite_3cyc_LS_XY_403ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr],
+ (instregex "LDP(W|X)(post|pre)")>;
+def KryoWrite_4cyc_LS_395ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi],
+ (instrs LDPSWi)>;
+def KryoWrite_4cyc_LS_XY_405ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr],
+ (instrs LDPSWpost, LDPSWpre)>;
+def KryoWrite_3cyc_LS_264ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_264ln],
+ (instrs LDRQui, LDRQl)>;
+def KryoWrite_4cyc_X_LS_271ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_271ln],
+ (instrs LDRQroW, LDRQroX)>;
+def KryoWrite_3cyc_LS_noRSV_287ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln],
+ (instregex "LDR((D|S)l|(D|S|H|B)ui)")>;
+def KryoWrite_3cyc_LS_XY_293ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr],
+ (instrs LDRQpost, LDRQpre)>;
+def KryoWrite_4cyc_X_LS_noRSV_297ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln],
+ (instregex "LDR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_319ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr],
+ (instregex "LDR(D|S|H|B)(post|pre)")>;
+def KryoWrite_3cyc_LS_261ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_261ln],
+ (instregex "LDR(BB|HH|W|X)ui")>;
+def KryoWrite_3cyc_LS_XY_292ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr],
+ (instregex "LDR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_4cyc_X_LS_272ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_272ln],
+ (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>;
+def KryoWrite_3cyc_LS_262ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_262ln],
+ (instrs LDRWl, LDRXl)>;
+def KryoWrite_4cyc_LS_268ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_268ln],
+ (instregex "LDRS(BW|BX|HW|HX|W)ui")>;
+def KryoWrite_5cyc_X_LS_273ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_LS_273ln],
+ (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>;
+def KryoWrite_4cyc_LS_XY_294ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr],
+ (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>;
+def KryoWrite_4cyc_LS_269ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_269ln],
+ (instrs LDRSWl)>;
+def KryoWrite_3cyc_LS_260ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_260ln],
+ (instregex "LDTR(B|H|W|X)i")>;
+def KryoWrite_4cyc_LS_267ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_267ln],
+ (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def KryoWrite_3cyc_LS_263ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_263ln],
+ (instrs LDURQi)>;
+def KryoWrite_3cyc_LS_noRSV_288ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln],
+ (instregex "LDUR(D|S|H|B)i")>;
+def KryoWrite_3cyc_LS_259ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_259ln],
+ (instregex "LDUR(BB|HH|W|X)i")>;
+def KryoWrite_4cyc_LS_266ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_266ln],
+ (instregex "LDURS(B|H)?(W|X)i")>;
+def KryoWrite_3cyc_LS_258ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258ln],
+ (instregex "LDXP(W|X)")>;
+def KryoWrite_3cyc_LS_258_1ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258_1ln],
+ (instregex "LDXR(B|H|W|X)")>;
+def KryoWrite_2cyc_XY_XY_137ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_137ln],
+ (instrs LSLVWr, LSLVXr)>;
+def KryoWrite_1cyc_XY_135ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_135ln],
+ (instregex "(LS|AS|RO)RV(W|X)r")>;
+def KryoWrite_4cyc_X_84ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_84ln],
+ (instrs MADDWrrr, MSUBWrrr)>;
+def KryoWrite_5cyc_X_85ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_85ln],
+ (instrs MADDXrrr, MSUBXrrr)>;
+def KryoWrite_4cyc_X_noRSV_188ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln],
+ (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>;
+def KryoWrite_4cyc_X_X_192ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_192ln],
+ (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>;
+def KryoWrite_1cyc_XY_noRSV_198ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln],
+ (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>;
+def KryoWrite_1cyc_XY_XY_199ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_199ln],
+ (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>;
+def KryoWrite_1cyc_X_89ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_89ln],
+ (instrs MOVKWi, MOVKXi)>;
+def KryoWrite_1cyc_XY_91ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_91ln],
+ (instrs MOVNWi, MOVNXi)>;
+def KryoWrite_1cyc_XY_90ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_90ln],
+ (instrs MOVZWi, MOVZXi)>;
+def KryoWrite_2cyc_XY_93ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_93ln],
+ (instrs MRS)>;
+def KryoWrite_0cyc_X_87ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_X_87ln],
+ (instrs MSRpstateImm1, MSRpstateImm4)>;
+def KryoWrite_0cyc_XY_88ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_88ln],
+ (instrs MSR)>;
+def KryoWrite_1cyc_XY_noRSV_143ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln],
+ (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_145ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_145ln],
+ (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_noRSV_193ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln],
+ (instrs NOTv8i8)>;
+def KryoWrite_1cyc_XY_XY_194ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_194ln],
+ (instrs NOTv16i8)>;
+def KryoWrite_2cyc_XY_noRSV_234ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln],
+ (instrs PMULv8i8)>;
+def KryoWrite_2cyc_XY_XY_236ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_236ln],
+ (instrs PMULv16i8)>;
+def KryoWrite_2cyc_XY_XY_235ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_235ln],
+ (instrs PMULLv8i8, PMULLv16i8)>;
+def KryoWrite_3cyc_XY_XY_237ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_237ln],
+ (instrs PMULLv1i64, PMULLv2i64)>;
+def KryoWrite_0cyc_LS_254ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_254ln],
+ (instrs PRFMl, PRFMui)>;
+def KryoWrite_0cyc_LS_253ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_253ln],
+ (instrs PRFUMi)>;
+def KryoWrite_6cyc_XY_X_noRSV_175ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln],
+ (instregex "R(ADD|SUB)HNv.*")>;
+def KryoWrite_2cyc_XY_204ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_204ln],
+ (instrs RBITWr, RBITXr)>;
+def KryoWrite_2cyc_XY_noRSV_218ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln],
+ (instrs RBITv8i8)>;
+def KryoWrite_2cyc_XY_XY_219ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_219ln],
+ (instrs RBITv16i8)>;
+def KryoWrite_1cyc_X_202ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_202ln],
+ (instregex "REV(16|32)?(W|X)r")>;
+def KryoWrite_1cyc_XY_noRSV_214ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln],
+ (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_1cyc_XY_XY_216ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_216ln],
+ (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_X_noRSV_244ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln],
+ (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>;
+def KryoWrite_3cyc_X_X_245ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_245ln],
+ (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>;
+def KryoWrite_1cyc_XY_2ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI],
+ (instregex "SBCS?(W|X)r")>;
+def KryoWrite_2cyc_XA_XA_XA_24ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+ let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln],
+ (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>;
+def KryoWrite_1cyc_XY_noRSV_21ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln],
+ (instrs SHA1Hrr)>;
+def KryoWrite_2cyc_X_X_23ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_23ln],
+ (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def KryoWrite_4cyc_XA_XA_XA_25ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln],
+ (instrs SHA256Hrrr, SHA256H2rrr)>;
+def KryoWrite_3cyc_XY_XY_X_X_26ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln],
+ (instrs SHA256SU1rrr)>;
+def KryoWrite_4cyc_X_noRSV_189ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln],
+ (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>;
+def KryoWrite_3cyc_XY_noRSV_68ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln],
+ (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_157ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln],
+ (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_164ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_164ln],
+ (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_X_noRSV_190ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln],
+ (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>;
+def KryoWrite_0cyc_LS_Y_274ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_274ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_1cyc_LS_Y_X_301ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_305ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln],
+ (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_323ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_275ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_275ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_306ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_322ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_324ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln],
+ (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln],
+ (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln],
+ (instregex "ST3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln],
+ (instregex "ST3Threev2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln],
+ (instregex "ST3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln],
+ (instregex "ST3Threev2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln],
+ (instregex "ST3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+ KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 13;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln],
+ (instregex "ST3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_325ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln],
+ (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln],
+ (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln],
+ (instregex "ST4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln],
+ (instregex "ST4Fourv2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln],
+ (instregex "ST4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln],
+ (instregex "ST4Fourv2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 16;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln],
+ (instregex "ST4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 17;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln],
+ (instregex "ST4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_LS_Y_299ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln],
+ (instregex "STLR(B|H|W|X)")>;
+def KryoWrite_3cyc_LS_LS_Y_307ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln],
+ (instregex "STLX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_0cyc_LS_Y_276ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_276ln],
+ (instrs STNPDi, STNPSi)>;
+def KryoWrite_0cyc_LS_Y_LS_Y_326ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln],
+ (instrs STNPQi)>;
+def KryoWrite_0cyc_LS_Y_280ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_280ln],
+ (instrs STNPWi, STNPXi)>;
+def KryoWrite_0cyc_LS_Y_277ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_277ln],
+ (instregex "STP(D|S)i")>;
+def KryoWrite_1cyc_LS_Y_X_303ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln],
+ (instregex "STP(D|S)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_327ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln],
+ (instrs STPQi)>;
+def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln],
+ (instrs STPQpost, STPQpre)>;
+def KryoWrite_0cyc_LS_Y_279ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_279ln],
+ (instregex "STP(W|X)i")>;
+def KryoWrite_1cyc_LS_X_Y_300ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln],
+ (instregex "STP(W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_278ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_278ln],
+ (instregex "STR(Q|D|S|H|B)ui")>;
+def KryoWrite_1cyc_X_LS_Y_295ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln],
+ (instregex "STR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_1cyc_LS_Y_X_304ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln],
+ (instregex "STR(Q|D|S|H|B)(post|pre)")>;
+def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln],
+ (instregex "STRQro(W|X)")>;
+def KryoWrite_0cyc_LS_Y_399ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_399ln],
+ (instregex "STR(BB|HH|W|X)ui")>;
+def KryoWrite_1cyc_X_LS_Y_406ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln],
+ (instregex "STR(BB|HH|W|X)ro(W|X)")>;
+def KryoWrite_1cyc_LS_X_Y_407ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln],
+ (instregex "STR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_398ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_398ln],
+ (instregex "STTR(B|H|W|X)i")>;
+def KryoWrite_0cyc_LS_Y_396ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_396ln],
+ (instregex "STUR(Q|D|S|H|B)i")>;
+def KryoWrite_0cyc_LS_Y_397ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_397ln],
+ (instregex "STUR(BB|HH|W|X)i")>;
+def KryoWrite_3cyc_LS_Y_404ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_Y_404ln],
+ (instregex "STX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_3cyc_XY_noRSV_160ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln],
+ (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_167ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_167ln],
+ (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI],
+ (instregex "SUBS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_5ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg],
+ (instregex "SUBS?(W|X)rx")>;
+def KryoWrite_2cyc_XY_XY_5_1ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg],
+ (instregex "SUBS?(W|X)rs")>;
+def KryoWrite_1cyc_XY_noRSV_6ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI],
+ (instregex "SUBS?(W|X)rr")>;
+def KryoWrite_0cyc_LS_9ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_9ln],
+ (instregex "SYSL?xt")>;
+def KryoWrite_1cyc_X_noRSV_205ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln],
+ (instrs TBLv8i8One)>;
+def KryoWrite_1cyc_X_X_208ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_208ln],
+ (instrs TBLv16i8One)>;
+def KryoWrite_2cyc_X_X_X_noRSV_222ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln],
+ (instrs TBLv8i8Two)>;
+def KryoWrite_2cyc_X_X_X_X_X_X_224ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln],
+ (instrs TBLv16i8Two)>;
+def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln],
+ (instrs TBLv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln],
+ (instrs TBLv8i8Four)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln],
+ (instrs TBLv16i8Three)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 15;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln],
+ (instrs TBLv16i8Four)>;
+def KryoWrite_2cyc_X_X_noRSV_220ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln],
+ (instrs TBXv8i8One)>;
+def KryoWrite_2cyc_X_X_X_X_221ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln],
+ (instrs TBXv16i8One)>;
+def KryoWrite_3cyc_X_X_X_X_noRSV_223ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln],
+ (instrs TBXv8i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln],
+ (instrs TBXv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln],
+ (instrs TBXv16i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln],
+ (instrs TBXv8i8Four)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln],
+ (instrs TBXv16i8Three)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 17;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln],
+ (instrs TBXv16i8Four)>;
+def KryoWrite_1cyc_XY_XY_217ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_217ln],
+ (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>;
+def KryoWrite_1cyc_X_X_211ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_211ln],
+ (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_XY_213ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_XY_213ln],
+ (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_3cyc_XY_noRSV_156ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln],
+ (instrs URECPEv2i32, URSQRTEv2i32)>;
+def KryoWrite_3cyc_XY_XY_168ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_168ln],
+ (instrs URECPEv4i32, URSQRTEv4i32)>;
+def KryoWrite_1cyc_X_X_210ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_210ln],
+ (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_noRSV_206ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln],
+ (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_1cyc_XY_noRSV_215ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln],
+ (instregex "XTNv.*")>;
+def KryoWrite_1cyc_X_X_209ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_209ln],
+ (instregex "ZIP1(v4i32|v8i16|v16i8)")>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 6525628dbfd6e..2288b8dfc223c 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -19,9 +19,8 @@
def ExynosM1Model : SchedMachineModel {
let IssueWidth = 4; // Up to 4 uops per cycle.
- let MinLatency = 0; // OoO.
let MicroOpBufferSize = 96; // ROB size.
- let LoopMicroOpBufferSize = 32; // Instruction queue size.
+ let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
let CompleteModel = 0; // Use the default model otherwise.
@@ -142,12 +141,13 @@ def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
// Other miscellaneous instructions.
-def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
//===----------------------------------------------------------------------===//
-// Fast forwarding.
+// Generic fast forwarding.
// TODO: Add FP register forwarding rules.
@@ -187,6 +187,10 @@ def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
M1UnitFST]> { let Latency = 3; }
def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
M1UnitL]> { let Latency = 9; }
+def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 6; }
+def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 7; }
def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
// FIXME: This is the worst case, conditional branch and link.
@@ -305,8 +309,10 @@ def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
-def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>;
-def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>;
+def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>;
+def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>;
+def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>;
+def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>;
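+// "v.i" matches the by-element (indexed) forms, e.g. FMULv2i32_indexed,
+// which are modeled with the longer NMISC+FMAC latency above; "v.f"
+// matches the plain vector forms, e.g. FMULv2f32.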
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
// ASIMD miscellaneous instructions.
@@ -337,16 +343,19 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
(instregex "^TB[LX]v16i8Four")>;
def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
-def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
-def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
-def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>;
+def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>;
+def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// ASIMD load instructions.
// ASIMD store instructions.
// Cryptography instructions.
-def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
+def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
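+// With the 1-cycle ReadAdvance, an AES instruction that consumes the
+// result of a preceding AES instruction (the common AESE/AESMC
+// chaining pattern) sees an effective latency of 1 - 1 = 0 cycles.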
+
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedVulcan.td
new file mode 100644
index 0000000000000..0aa2462eba837
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedVulcan.td
@@ -0,0 +1,855 @@
+//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// 1. Introduction
+//
+// This file defines the machine model for Broadcom Vulcan to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def VulcanModel : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 32;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def VulcanP0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def VulcanP1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def VulcanP2 : ProcResource<1>;
+
+// Port 3: Store data.
+def VulcanP3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def VulcanP4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def VulcanP5 : ProcResource<1>;
+
+let SchedModel = VulcanModel in {
+
+// Define groups for the functional units on each
+// issue port. Each group created will be used
+// by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member. This
+// is a way to create names for the various functional
+// units that share a single issue port. For example,
+// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for
+// FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def VulcanI1 : ProcResGroup<[VulcanP1]>;
+
+// Branch micro-ops only on port 2.
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60-entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+ VulcanP3, VulcanP4, VulcanP5]> {
+ let BufferSize = 60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
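+// Note: the divider is only partially pipelined, so ResourceCycles
+// occupies the F0/F1 pipe for 8 of the 16 result cycles; the 23-cycle
+// write below is analogous with an 11-cycle occupancy.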
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 23;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = VulcanModel in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+def : WriteRes<WriteIEReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Move immed
+def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
+
+// Variable shift
+def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23. Take the average.
+def : WriteRes<WriteID32, [VulcanI1]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+
+// Divide, X-form
+// Latency range of 13-39. Take the average.
+def : WriteRes<WriteID64, [VulcanI1]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
+
+// Bitfield move, basic
+// Bitfield move, insert
+// NOTE: Handled by WriteIS.
+
+// Count leading
+def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
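+// For illustration: a scaled register-offset load such as
+//   ldr x0, [x1, x2, lsl #3]
+// satisfies ScaledIdxPred and resolves WriteLDIdx to the 6-cycle
+// variant with a second ALU micro-op, while the unscaled form
+//   ldr x0, [x1, x2]
+// takes the 5-cycle variant. Both cases of VulcanReadAdrBase resolve
+// to ReadDefault for now; the variant exists so they can diverge later.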
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+}
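+// For illustration: a load pair such as LDPXi is scheduled (in the
+// instruction definitions) as [WriteLD, WriteLDHi], so its single LS
+// micro-op is billed to WriteLD's LS01 resource while WriteLDHi only
+// supplies the 5-cycle latency of the second destination register.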
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
+
+// FP arithmetic
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
+
+// FP divide, S-form
+// FP square root, S-form
+def : WriteRes<WriteFDiv, [VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+
+// FP round to integral
+def : InstRW<[VulcanWrite_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP negate
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+
+// ASIMD FP round, D-form
+// ASIMD FP round, Q-form
+// NOTE: Handled by WriteV.
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+
+// ASIMD extract
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+// ASIMD extract narrow, saturating
+// NOTE: Handled by WriteV.
+
+// ASIMD insert, element to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+//--
+// 3.17 Cryptography Extensions
+//--
+
+// Crypto AES ops
+def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 xor ops
+// Crypto SHA1 schedule acceleration ops
+// Crypto SHA256 schedule acceleration op (1 u-op)
+// Crypto SHA256 schedule acceleration op (2 u-ops)
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+
+//--
+// 3.18 CRC
+//--
+
+// CRC checksum ops
+def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+
+} // SchedModel = VulcanModel
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index eaa9110ab1bc6..ce81f48acf712 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -51,15 +51,15 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
// Predicate for determining when a shiftable register is shifted.
-def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>;
// Predicate for determining when an extendable register is extended.
-def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>;
// ScaledIdxPred is true if a WriteLDIdx operand will be
// scaled. Subtargets can use this to dynamically select resources and
// latency for WriteLDIdx and ReadAdrBase.
-def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>;
// Serialized two-level address load.
// EXAMPLE: LOADGot
@@ -92,6 +92,8 @@ def WriteV : SchedWrite; // Vector ops.
def WriteVLD : SchedWrite; // Vector loads.
def WriteVST : SchedWrite; // Vector stores.
+def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP)
+
// Read the unwritten lanes of the VLD's destination registers.
def ReadVLD : SchedRead;
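
Editor's note: the change from MI to *MI in the predicates above follows this patch's wider move of the AArch64InstrInfo query hooks from MachineInstr pointers to references. A declaration-only C++ sketch of the shape those hooks take after the change (the struct name is a stand-in, not the real class):

namespace llvm { class MachineInstr; }

// Sketch: by-reference query hooks the SchedPredicates now dereference into.
struct AArch64InstrInfoQueries {
  bool hasShiftedReg(const llvm::MachineInstr &MI) const;  // shifted-reg form?
  bool hasExtendedReg(const llvm::MachineInstr &MI) const; // extended-reg form?
  bool isScaledAddr(const llvm::MachineInstr &MI) const;   // scaled address?
};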
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index f40293021d74e..66a8f332513a7 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-selectiondag-info"
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
- SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
// Check to see if there is a specialized entry-point for memory zeroing.
@@ -44,10 +44,16 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
return SDValue();
}
+bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
+ CodeGenOpt::Level OptLevel) const {
+  return OptLevel >= CodeGenOpt::Aggressive;
+}
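
Editor's note: with this hook returning true at CodeGenOpt::Aggressive, FMA formation on AArch64 is deferred to the machine combiner at -O3 instead of being done during DAG combining. A minimal sketch of a combine gating itself on the hook (the helper name is an assumption; the hook is the one declared in SelectionDAGTargetInfo):

#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Support/CodeGen.h"

// Sketch: skip the DAG-level fmul+fadd -> FMA fold when the target says the
// machine combiner will form FMAs later.
static bool shouldFormFMAInDAG(const llvm::SelectionDAGTargetInfo &STI,
                               llvm::CodeGenOpt::Level OptLevel) {
  return !STI.generateFMAsInMachineCombiner(OptLevel);
}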
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 97421b45b122e..7e4f11091226d 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -7,24 +7,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the AArch64 subclass for TargetSelectionDAGInfo.
+// This file defines the AArch64 subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
- SDValue Dst, SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
+ bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 1c6b15790ea98..f904b23794169 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -115,6 +115,9 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
}
bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
const TargetSubtargetInfo &ST = MF.getSubtarget();
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
@@ -141,8 +144,8 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
if (!isNarrowFPStore(MI))
continue;
unsigned BaseReg;
- unsigned Offset;
- if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) {
+ int64_t Offset;
+ if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
if (PrevBaseReg == BaseReg) {
// If this block can take STPs, skip ahead to the next block.
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
@@ -150,7 +153,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
// Otherwise, continue unpairing the stores in this block.
DEBUG(dbgs() << "Unpairing store " << MI << "\n");
SuppressSTP = true;
- TII->suppressLdStPair(&MI);
+ TII->suppressLdStPair(MI);
}
PrevBaseReg = BaseReg;
} else
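
Editor's note: Offset is widened from unsigned to int64_t here because getMemOpBaseRegImmOfs reports a signed byte offset. A caller sketch mirroring the loop above (the local and function names are illustrative):

#include "AArch64InstrInfo.h" // in-tree include, as in this file

// Sketch: decompose a load/store into base register + signed byte offset.
static void inspectMemOp(const llvm::AArch64InstrInfo *TII,
                         llvm::MachineInstr &MI,
                         const llvm::TargetRegisterInfo *TRI) {
  unsigned BaseReg;
  int64_t Offset; // signed: pre/post-indexed forms can go negative
  if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
    // The access address is BaseReg + Offset bytes.
  }
}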
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index f6ee8cf47a6a4..7dd8ccbe6c25e 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -11,10 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64Subtarget.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
-#include "AArch64Subtarget.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetRegistry.h"
@@ -44,58 +43,83 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
CPUString = "generic";
ParseSubtargetFeatures(CPUString, FS);
+ initializeProperties();
+
return *this;
}
+void AArch64Subtarget::initializeProperties() {
+ // Initialize CPU specific properties. We should add a tablegen feature for
+ // this in the future so we can specify it together with the subtarget
+ // features.
+ switch (ARMProcFamily) {
+ case Cyclone:
+ CacheLineSize = 64;
+ PrefetchDistance = 280;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 3;
+ break;
+ case CortexA57:
+ MaxInterleaveFactor = 4;
+ break;
+ case ExynosM1:
+ PrefFunctionAlignment = 4;
+ PrefLoopAlignment = 3;
+ break;
+ case Kryo:
+ MaxInterleaveFactor = 4;
+ VectorInsertExtractBaseCost = 2;
+ CacheLineSize = 128;
+ PrefetchDistance = 740;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 11;
+ break;
+ case Vulcan:
+ MaxInterleaveFactor = 4;
+ break;
+ case CortexA35: break;
+ case CortexA53: break;
+ case CortexA72: break;
+ case CortexA73: break;
+ case Others: break;
+ }
+}
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false),
- HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false),
- HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
- StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian),
- CPUString(CPU), TargetTriple(TT), FrameLowering(),
+ : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+ IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
- TLInfo(TM, *this) {}
+ TLInfo(TM, *this), GISel() {}
+
+const CallLowering *AArch64Subtarget::getCallLowering() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
-/// ClassifyGlobalReference - Find the target operand flags that describe
-/// how a global value should be referenced for the current subtarget.
+/// Find the target operand flags that describe how a global value should be
+/// referenced for the current subtarget.
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
- const TargetMachine &TM) const {
- bool isDef = GV->isStrongDefinitionForLinker();
-
+ const TargetMachine &TM) const {
// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
return AArch64II::MO_GOT;
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return AArch64II::MO_GOT;
+
// The small code model's direct accesses use ADRP, which cannot necessarily
// produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) {
- // In PIC mode use the GOT, but in absolute mode use a constant pool load.
- if (TM.getRelocationModel() == Reloc::Static)
- return AArch64II::MO_CONSTPOOL;
- else
- return AArch64II::MO_GOT;
- }
-
- // If symbol visibility is hidden, the extra load is not needed if
- // the symbol is definitely defined in the current translation unit.
-
- // The handling of non-hidden symbols in PIC mode is rather target-dependent:
- // + On MachO, if the symbol is defined in this module the GOT can be
- // skipped.
- // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
- // defined could end up in unexpected places. Use a GOT.
- if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
- if (isTargetMachO())
- return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
- else
- // No need to go through the GOT for local symbols on ELF.
- return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
- }
+ if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+ return AArch64II::MO_GOT;
return AArch64II::MO_NO_FLAG;
}
@@ -114,8 +138,7 @@ const char *AArch64Subtarget::getBZeroEntry() const {
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin, MachineInstr *end,
- unsigned NumRegionInstrs) const {
+ unsigned NumRegionInstrs) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
@@ -123,8 +146,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// Enabling or disabling the latency heuristic is a close call: it seems to
// help almost no benchmark on out-of-order architectures; on the other hand,
// it regresses register pressure on a few benchmarks.
- if (isCyclone())
- Policy.DisableLatencyHeuristic = true;
+ Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -146,8 +168,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- if (!isCortexA57())
- return nullptr;
-
- return llvm::make_unique<A57ChainingConstraint>();
+ return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}
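
Editor's note: beyond the PBQP change, the point of initializeProperties() is that tuning clients read accessors instead of string-comparing CPU names. A sketch of such a consumer (the function name is an assumption; the accessor is the one added to AArch64Subtarget):

#include "AArch64Subtarget.h" // in-tree include

// Sketch: Kryo reports 740 and Cyclone 280 per initializeProperties(); any
// family that sets nothing keeps the default of 0, i.e. no software prefetch.
static unsigned prefetchDistanceFor(const llvm::AArch64Subtarget &ST) {
  return ST.getPrefetchDistance();
}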
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 151133b2f32c9..16a35405c8924 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -19,6 +19,7 @@
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -32,38 +33,64 @@ class StringRef;
class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
- enum ARMProcFamilyEnum {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
CortexA57,
+ CortexA72,
+ CortexA73,
Cyclone,
- ExynosM1
+ ExynosM1,
+ Kryo,
+ Vulcan
};
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
- ARMProcFamilyEnum ARMProcFamily;
+ ARMProcFamilyEnum ARMProcFamily = Others;
- bool HasV8_1aOps;
- bool HasV8_2aOps;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
- bool HasFPARMv8;
- bool HasNEON;
- bool HasCrypto;
- bool HasCRC;
- bool HasPerfMon;
- bool HasFullFP16;
- bool HasSPE;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
+ bool HasCrypto = false;
+ bool HasCRC = false;
+ bool HasRAS = false;
+ bool HasPerfMon = false;
+ bool HasFullFP16 = false;
+ bool HasSPE = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
- bool HasZeroCycleRegMove;
+ bool HasZeroCycleRegMove = false;
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
- bool HasZeroCycleZeroing;
+ bool HasZeroCycleZeroing = false;
// StrictAlign - Disallow unaligned memory accesses.
- bool StrictAlign;
+ bool StrictAlign = false;
+ bool MergeNarrowLoads = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasMacroOpFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;
+ unsigned PrefFunctionAlignment = 0;
+ unsigned PrefLoopAlignment = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -80,12 +107,20 @@ protected:
AArch64InstrInfo InstrInfo;
AArch64SelectionDAGInfo TSInfo;
AArch64TargetLowering TLInfo;
+  /// Accessor that gathers the entry points to the GlobalISel-related APIs.
+  /// It is used to avoid ifndefs spreading around while GISel is an
+  /// optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+
private:
/// initializeSubtargetDependencies - Initializes using CPUString and the
/// passed in feature string so that we can use initializer lists for
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -93,6 +128,11 @@ public:
const std::string &FS, const TargetMachine &TM,
bool LittleEndian);
+  /// This object will take ownership of \p GISelAccessor.
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
+ }
+
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
@@ -106,10 +146,20 @@ public:
const AArch64RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
+ const CallLowering *getCallLowering() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isGeneric() || isCortexA53() || isCortexA57();
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -126,6 +176,33 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
+ bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+ unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+ unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -146,13 +223,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- bool isGeneric() const { return CPUString == "generic"; }
- bool isCyclone() const { return CPUString == "cyclone"; }
- bool isCortexA57() const { return CPUString == "cortex-a57"; }
- bool isCortexA53() const { return CPUString == "cortex-a53"; }
- bool isExynosM1() const { return CPUString == "exynos-m1"; }
-
- bool useAA() const override { return isCortexA53(); }
+ bool useAA() const override { return UseAA; }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -174,8 +245,7 @@ public:
/// returns null.
const char *getBZeroEntry() const;
- void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
- MachineInstr *end,
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
bool enableEarlyIfConversion() const override;
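
Editor's note: a sketch of the wiring setGISelAccessor() expects: the caller allocates an accessor and the subtarget takes ownership. Building a concrete GlobalISel accessor is assumed to happen behind the optional-library guard elsewhere; the function name here is illustrative.

#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"

// Sketch: hand the subtarget a default accessor, whose getCallLowering() /
// getRegBankInfo() stay null until GlobalISel support is plugged in.
static void wireGISel(llvm::AArch64Subtarget &Subtarget) {
  auto *GISel = new llvm::GISelAccessor();
  Subtarget.setGISelAccessor(*GISel); // ownership transfers to Subtarget
}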
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
new file mode 100644
index 0000000000000..a3736c0868fb7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -0,0 +1,1018 @@
+//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the symbolic operands permitted for various kinds of
+// AArch64 system instruction.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/TableGen/SearchableTable.td"
+
+//===----------------------------------------------------------------------===//
+// AT (address translate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
+
+def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
+def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
+
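Editor's note: the Encoding bit packing in the AT class (and in the DC/TLBI/SysReg classes below) is the generic five-field SYS operand layout: op0 in bits [15:14], op1 in [13:11], CRn in [10:7], CRm in [6:3], op2 in [2:0]. A small self-contained C++ sketch of the same packing, checked against the S1E1R definition above:

#include <cstdint>

// Sketch: pack the five SYS operand fields into the 16-bit encoding.
constexpr uint16_t sysEncoding(unsigned Op0, unsigned Op1, unsigned CRn,
                               unsigned CRm, unsigned Op2) {
  return static_cast<uint16_t>((Op0 << 14) | (Op1 << 11) | (CRn << 7) |
                               (CRm << 3) | Op2);
}

// AT S1E1R = <0b01, 0b000, 0b0111, 0b1000, 0b000> packs to 0x43C0.
static_assert(sysEncoding(0b01, 0b000, 0b0111, 0b1000, 0b000) == 0x43C0,
              "S1E1R encoding");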
+
+//===----------------------------------------------------------------------===//
+// DMB/DSB (data barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DB<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding = encoding;
+}
+
+def : DB<"oshld", 0x1>;
+def : DB<"oshst", 0x2>;
+def : DB<"osh", 0x3>;
+def : DB<"nshld", 0x5>;
+def : DB<"nshst", 0x6>;
+def : DB<"nsh", 0x7>;
+def : DB<"ishld", 0x9>;
+def : DB<"ishst", 0xa>;
+def : DB<"ish", 0xb>;
+def : DB<"ld", 0xd>;
+def : DB<"st", 0xe>;
+def : DB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// DC (data cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
+
+def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+
+//===----------------------------------------------------------------------===//
+// IC (instruction cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
+ bit needsreg> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<14> Encoding;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
+def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
+def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>;
+
+//===----------------------------------------------------------------------===//
+// ISB (instruction synchronization barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class ISB<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+}
+
+def : ISB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// PRFM (prefetch) instruction options.
+//===----------------------------------------------------------------------===//
+
+class PRFM<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PRFM<"pldl1keep", 0x00>;
+def : PRFM<"pldl1strm", 0x01>;
+def : PRFM<"pldl2keep", 0x02>;
+def : PRFM<"pldl2strm", 0x03>;
+def : PRFM<"pldl3keep", 0x04>;
+def : PRFM<"pldl3strm", 0x05>;
+def : PRFM<"plil1keep", 0x08>;
+def : PRFM<"plil1strm", 0x09>;
+def : PRFM<"plil2keep", 0x0a>;
+def : PRFM<"plil2strm", 0x0b>;
+def : PRFM<"plil3keep", 0x0c>;
+def : PRFM<"plil3strm", 0x0d>;
+def : PRFM<"pstl1keep", 0x10>;
+def : PRFM<"pstl1strm", 0x11>;
+def : PRFM<"pstl2keep", 0x12>;
+def : PRFM<"pstl2strm", 0x13>;
+def : PRFM<"pstl3keep", 0x14>;
+def : PRFM<"pstl3strm", 0x15>;
+
+//===----------------------------------------------------------------------===//
+// PState instruction options.
+//===----------------------------------------------------------------------===//
+
+class PState<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+def : PState<"SPSel", 0b00101>;
+def : PState<"DAIFSet", 0b11110>;
+def : PState<"DAIFClr", 0b11111>;
+// v8.1a "Privileged Access Never" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : PState<"PAN", 0b00100>;
+// v8.2a "User Access Override" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : PState<"UAO", 0b00011>;
+
+
+//===----------------------------------------------------------------------===//
+// PSB instruction options.
+//===----------------------------------------------------------------------===//
+
+class PSB<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PSB<"csync", 0x11>;
+
+//===----------------------------------------------------------------------===//
+// TLBI (translation lookaside buffer invalidate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg = 1> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+
+
+//===----------------------------------------------------------------------===//
+// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+
+class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit Readable = ?;
+ bit Writeable = ?;
+ code Requires = [{ {} }];
+}
+
+class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 1;
+}
+
+class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 0;
+}
+
+class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 0;
+ let Writeable = 1;
+}
+
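Editor's note: SearchableFields/EnumValueField direct TableGen to emit lookup helpers for these tables. A C++ sketch of the intended consumer shape; the declared struct layout and the lookupSysRegByName helper are assumptions about what the searchable-table backend generates:

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Sketch: assumed shape of the generated table entry and lookup helper.
namespace llvm { namespace AArch64SysReg {
struct SysReg { uint16_t Encoding; bool Readable, Writeable; };
const SysReg *lookupSysRegByName(StringRef Name);
}} // namespace llvm::AArch64SysReg

// Resolve an MSR target by name and validate that it can be written.
static bool isValidMSRTarget(llvm::StringRef Name) {
  const auto *Reg = llvm::AArch64SysReg::lookupSysRegByName(Name);
  return Reg && Reg->Writeable;
}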
+//===----------------------
+// Read-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>;
+def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
+def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
+def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>;
+def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>;
+def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>;
+def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>;
+def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>;
+def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>;
+def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>;
+def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>;
+def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>;
+def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>;
+def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>;
+def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>;
+def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>;
+def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>;
+def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>;
+def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>;
+def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>;
+def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>;
+def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>;
+def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>;
+def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>;
+def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> {
+ let Requires = [{ {AArch64::HasV8_2aOps} }];
+}
+def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>;
+def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>;
+def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>;
+def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>;
+def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>;
+def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>;
+def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>;
+def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>;
+def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>;
+def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>;
+def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>;
+def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>;
+def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>;
+def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>;
+def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>;
+def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>;
+def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>;
+def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>;
+def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>;
+def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>;
+def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>;
+def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>;
+def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>;
+def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>;
+def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>;
+def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>;
+def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>;
+def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>;
+def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>;
+def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>;
+def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>;
+def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>;
+def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>;
+def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>;
+def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>;
+def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>;
+def : ROSysReg<"ICC_HPPIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b010>;
+def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>;
+def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>;
+def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system register
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>;
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>;
+def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>;
+}
+
+//===----------------------
+// Write-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>;
+
+// Trace Registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>;
+def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>;
+def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>;
+def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>;
+def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>;
+def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>;
+
+//===----------------------
+// Read-write regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>;
+def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>;
+def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>;
+def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>;
+def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>;
+def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>;
+def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>;
+def : RWSysReg<"DBGBVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b100>;
+def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>;
+def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>;
+def : RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>;
+def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>;
+def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>;
+def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>;
+def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>;
+def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>;
+def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>;
+def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>;
+def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>;
+def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>;
+def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>;
+def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>;
+def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>;
+def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>;
+def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>;
+def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>;
+def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>;
+def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>;
+def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>;
+def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>;
+def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>;
+def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>;
+def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>;
+def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>;
+def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>;
+def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>;
+def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>;
+def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>;
+def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"OSDLR_EL1", 0b10, 0b000, 0b0001, 0b0011, 0b100>;
+def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>;
+def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>;
+def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>;
+def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>;
+def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>;
+def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
+def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
+def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>;
+def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>;
+def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>;
+def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>;
+def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>;
+def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>;
+def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>;
+def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>;
+def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>;
+def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>;
+def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>;
+def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>;
+def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>;
+def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>;
+def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>;
+def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>;
+def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>;
+def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>;
+def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>;
+def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>;
+def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>;
+def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>;
+def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>;
+def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>;
+def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>;
+def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>;
+def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>;
+def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>;
+def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>;
+def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>;
+def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>;
+def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>;
+def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>;
+def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>;
+def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>;
+def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>;
+def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>;
+def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>;
+def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>;
+def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>;
+def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>;
+def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>;
+def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>;
+def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>;
+def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>;
+def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>;
+def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>;
+def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>;
+def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>;
+def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>;
+def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>;
+def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>;
+def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>;
+def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>;
+def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>;
+def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>;
+def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>;
+def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>;
+def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>;
+def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>;
+def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>;
+def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>;
+def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>;
+def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>;
+def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>;
+def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>;
+def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>;
+def : RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>;
+def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>;
+def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b100>;
+def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>;
+def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>;
+def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>;
+def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>;
+def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>;
+def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>;
+def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>;
+def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>;
+def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>;
+def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>;
+def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>;
+def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>;
+def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>;
+def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>;
+def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>;
+def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>;
+def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>;
+def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>;
+def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>;
+def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>;
+def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>;
+def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>;
+def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>;
+def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>;
+def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>;
+def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>;
+def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>;
+def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>;
+def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>;
+def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>;
+def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>;
+def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>;
+def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR3", 0b10, 0b001, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"TRCCNTCTLR2", 0b10, 0b001, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>;
+def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>;
+def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>;
+def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>;
+def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>;
+def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>;
+def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>;
+def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>;
+def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>;
+def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>;
+def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>;
+def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>;
+def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>;
+def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>;
+def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>;
+def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>;
+def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>;
+def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>;
+def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>;
+def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>;
+def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>;
+def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>;
+def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>;
+def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>;
+def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>;
+def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>;
+def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>;
+def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>;
+def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>;
+def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>;
+def : RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>;
+def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>;
+def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>;
+def : RWSysReg<"TRCSSCSR1", 0b10, 0b001, 0b0001, 0b1001, 0b010>;
+def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>;
+def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>;
+def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>;
+def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>;
+def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>;
+def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>;
+def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>;
+def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>;
+def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>;
+def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>;
+def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>;
+def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>;
+def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>;
+def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>;
+def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>;
+def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>;
+def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>;
+def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>;
+def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>;
+def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>;
+def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>;
+def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>;
+def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>;
+def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>;
+def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>;
+def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>;
+def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>;
+def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>;
+def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>;
+def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>;
+def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>;
+def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>;
+def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>;
+def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>;
+def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>;
+def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>;
+def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>;
+def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>;
+def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>;
+def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>;
+def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>;
+def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>;
+def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>;
+def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 0b1000, 0b100>;
+def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>;
+def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>;
+def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>;
+def : RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>;
+def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>;
+def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>;
+def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>;
+def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>;
+def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>;
+def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>;
+def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>;
+def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>;
+def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>;
+def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>;
+def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>;
+def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>;
+def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>;
+def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>;
+def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>;
+def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>;
+def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>;
+def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>;
+def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>;
+def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>;
+def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>;
+def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>;
+def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>;
+def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>;
+def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>;
+def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>;
+def : RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>;
+def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>;
+def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>;
+def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
+def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
+def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
+def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
+def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
+def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>;
+def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>;
+def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>;
+def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>;
+def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>;
+def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>;
+def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>;
+
+// v8.1a "Privileged Access Never" extension-specific system registers
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>;
+def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>;
+}
+
+// v8.1a "Virtualization hos extensions" system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>;
+}
+// v8.2a registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>;
+
+// v8.2a "Statistical Profiling extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureSPE} }] in {
+def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
+def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>;
+def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>;
+def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>;
+def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>;
+def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>;
+def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>;
+def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>;
+def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
+}
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>;
+def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>;
+def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>;
+def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>;
+def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>;
+def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>;
+def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>;
+}
+
+// Cyclone specific system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::ProcCyclone} }] in
+def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index c52c5544fc7e2..0b6345ff8011b 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,13 +11,19 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "AArch64CallLowering.h"
+#include "AArch64RegisterBankInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -58,6 +64,11 @@ EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden,
cl::init(true));
static cl::opt<bool>
+EnableRedundantCopyElimination("aarch64-redundant-copy-elim",
+ cl::desc("Enable the redundant copy elimination pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
" optimization pass"), cl::init(true), cl::Hidden);
@@ -92,11 +103,19 @@ static cl::opt<cl::boolOrDefault>
EnableGlobalMerge("aarch64-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
+static cl::opt<bool>
+ EnableLoopDataPrefetch("aarch64-loop-data-prefetch", cl::Hidden,
+ cl::desc("Enable the loop data prefetch pass"),
+ cl::init(true));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target);
+ auto PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
+ initializeAArch64ExpandPseudoPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -114,29 +133,79 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
if (TT.isOSBinFormatMachO())
return "e-m:o-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
- return "e-m:e-i64:64-i128:128-n32:64-S128";
- return "E-m:e-i64:64-i128:128-n32:64-S128";
+ return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+ return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
}
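Editor's note on the new datalayout entries: in LLVM's datalayout grammar, `i8:8:32` keeps i8's ABI alignment at 8 bits while raising its preferred alignment to 32 bits (likewise `i16:16:32`), so byte- and halfword-sized values may be word-aligned when profitable without changing the ABI. A minimal sketch of reading this back through the API (assumes LLVM headers; illustrative only, not part of the patch):

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Parse the new little-endian layout string and query the i8 alignments.
void showI8Alignments() {
  DataLayout DL("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
  LLVMContext Ctx;
  Type *I8 = Type::getInt8Ty(Ctx);
  unsigned ABI = DL.getABITypeAlignment(I8);   // 1 byte: ABI unchanged
  unsigned Pref = DL.getPrefTypeAlignment(I8); // 4 bytes: new preference
  (void)ABI; (void)Pref;
}
```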
-/// TargetMachine ctor - Create an AArch64 architecture model.
+// Helper function to set up the defaults for reciprocals.
+static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
+{
+ // For the estimates, convergence is quadratic, so essentially the number of
+ // digits is doubled after each iteration. On ARMv8, the minimum architected
+ // accuracy of the initial estimate is 2^-8. Therefore, the number of extra
+ // steps to refine the result for float (23 mantissa bits) and for double
+ // (52 mantissa bits) are 2 and 3, respectively.
+ unsigned ExtraStepsF = 2,
+ ExtraStepsD = ExtraStepsF + 1;
+ bool UseRsqrt = ST.useRSqrt();
+
+ TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
+ TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD);
+
+ TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD);
+ TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD);
+}
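A standalone sanity check of the ExtraSteps values above (not part of the patch): each Newton-Raphson step squares the error, so the number of correct bits doubles per step, starting from the architected 2^-8 minimum.

```cpp
#include <cassert>

// Correct bits start at 8 (the 2^-8 architected minimum accuracy) and
// double with every Newton-Raphson refinement step.
static unsigned refinementSteps(unsigned MantissaBits) {
  unsigned Bits = 8, Steps = 0;
  while (Bits < MantissaBits) {
    Bits *= 2;
    ++Steps;
  }
  return Steps;
}

int main() {
  assert(refinementSteps(23) == 2); // float  -> ExtraStepsF
  assert(refinementSteps(52) == 3); // double -> ExtraStepsD
  return 0;
}
```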
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ // AArch64 Darwin is always PIC.
+ if (TT.isOSDarwin())
+ return Reloc::PIC_;
+ // On ELF platforms the default static relocation model has a smart enough
+ // linker to cope with referencing external symbols defined in a shared
+ // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+ if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+ return Reloc::Static;
+ return *RM;
+}
+
+/// Create an AArch64 architecture model.
///
-AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool LittleEndian)
+AArch64TargetMachine::AArch64TargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian)
// This nested ternary is horrible, but DL needs to be properly
// initialized before TLInfo is constructed.
: LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
- Options, RM, CM, OL),
+ Options, getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
- isLittle(LittleEndian) {
+ Subtarget(TT, CPU, FS, *this, LittleEndian) {
+ initReciprocals(*this, Subtarget);
initAsmInfo();
}
AArch64TargetMachine::~AArch64TargetMachine() {}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct AArch64GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -156,7 +225,18 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
- isLittle);
+ Subtarget.isLittleEndian());
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ AArch64GISelActualAccessor *GISel =
+ new AArch64GISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AArch64CallLowering(*I->getTargetLowering()));
+ GISel->RegBankInfo.reset(
+ new AArch64RegisterBankInfo(*I->getRegisterInfo()));
+#endif
+ I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -165,16 +245,16 @@ void AArch64leTargetMachine::anchor() { }
AArch64leTargetMachine::AArch64leTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
void AArch64beTargetMachine::anchor() { }
AArch64beTargetMachine::AArch64beTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
@@ -194,6 +274,10 @@ public:
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addRegBankSelect() override;
+#endif
bool addILPOpts() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -223,6 +307,13 @@ void AArch64PassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass());
+ // Run LoopDataPrefetch
+ //
+ // Run this before LSR to remove the multiplies involved in computing the
+ // pointer values N iterations ahead.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
+ addPass(createLoopDataPrefetchPass());
+
TargetPassConfig::addIRPasses();
// Match interleaved memory accesses to ldN/stN intrinsics.
@@ -278,6 +369,17 @@ bool AArch64PassConfig::addInstSelector() {
return false;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool AArch64PassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+bool AArch64PassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+#endif
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
@@ -303,6 +405,10 @@ void AArch64PassConfig::addPreRegAlloc() {
}
void AArch64PassConfig::addPostRegAlloc() {
+ // Remove redundant copy instructions.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
+ addPass(createAArch64RedundantCopyEliminationPass());
+
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
addPass(createAArch64DeadRegisterDefinitions());
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 8d49a29386ac8..b44107b065bd0 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -29,7 +29,7 @@ protected:
public:
AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool IsLittleEndian);
~AArch64TargetMachine() override;
@@ -46,28 +46,28 @@ public:
}
private:
- bool isLittle;
+ AArch64Subtarget Subtarget;
};
-// AArch64leTargetMachine - AArch64 little endian target machine.
+// AArch64 little endian target machine.
//
class AArch64leTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-// AArch64beTargetMachine - AArch64 big endian target machine.
+// AArch64 big endian target machine.
//
class AArch64beTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9af0e6444789a..ecf4d93068a4e 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -291,6 +291,61 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
+int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+ VectorType *VecTy,
+ unsigned Index) {
+
+ // Make sure we were given a valid extend opcode.
+ assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+ "Invalid opcode");
+
+ // We are extending an element we extract from a vector, so the source type
+ // of the extend is the element type of the vector.
+ auto *Src = VecTy->getElementType();
+
+ // Sign- and zero-extends are for integer types only.
+ assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
+
+ // Get the cost for the extract. We compute the cost (if any) for the extend
+ // below.
+ auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+
+ // Legalize the types.
+ auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
+ auto DstVT = TLI->getValueType(DL, Dst);
+ auto SrcVT = TLI->getValueType(DL, Src);
+
+ // If the resulting type is still a vector and the destination type is legal,
+ // we may get the extension for free. If not, get the default cost for the
+ // extend.
+ if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ // The destination type should be larger than the element type. If not, get
+ // the default cost for the extend.
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Opcode should be either SExt or ZExt");
+
+ // For sign-extends, we only need a smov, which performs the extension
+ // automatically.
+ case Instruction::SExt:
+ return Cost;
+
+ // For zero-extends, the extend is performed automatically by a umov unless
+ // the destination type is i64 and the element type is i8 or i16.
+ case Instruction::ZExt:
+ if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
+ return Cost;
+ }
+
+ // If we are unable to perform the extend for free, get the default cost.
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+}
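To make the cost rule concrete, here is a hedged sketch of querying the hook through TTI (assuming the generic entry point mirrors the signature added above; the function name in the sketch is hypothetical):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Lane 0 of <8 x i16> zero-extended to i64: umov cannot zero-extend a
// 16-bit lane all the way to i64, so the cast cost is added on top of the
// extract cost. The same lane sign-extended would cost only the extract,
// since smov widens for free.
int exampleExtractZExtCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  auto *VecTy = VectorType::get(Type::getInt16Ty(Ctx), 8);
  return TTI.getExtractWithExtendCost(Instruction::ZExt,
                                      Type::getInt64Ty(Ctx), VecTy, 0);
}
```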
+
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -313,7 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// All other insert/extracts cost this much.
- return 3;
+ return ST->getVectorInsertExtractBaseCost();
}
int AArch64TTIImpl::getArithmeticInstrCost(
@@ -472,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
}
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
- if (ST->isCortexA57())
- return 4;
- return 2;
+ return ST->getMaxInterleaveFactor();
}
void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -571,3 +624,19 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
return true;
}
+
+unsigned AArch64TTIImpl::getCacheLineSize() {
+ return ST->getCacheLineSize();
+}
+
+unsigned AArch64TTIImpl::getPrefetchDistance() {
+ return ST->getPrefetchDistance();
+}
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+ return ST->getMinPrefetchStride();
+}
+
+unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
+ return ST->getMaxPrefetchIterationsAhead();
+}
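The four hooks above feed the LoopDataPrefetch pass that addIRPasses now schedules before LSR. A hedged C++ analogue of the transformation that pass aims for (the builtin is the GCC/Clang software-prefetch intrinsic; the distance arithmetic is illustrative):

```cpp
#include <cstddef>

// Sum an array while prefetching DistElems elements ahead, roughly
// getPrefetchDistance() bytes divided by the access stride, so the cache
// line arrives by the time the loop reaches it.
double prefetchedSum(const double *A, size_t N, size_t DistElems) {
  double Sum = 0.0;
  for (size_t I = 0; I < N; ++I) {
    if (I + DistElems < N)
      __builtin_prefetch(&A[I + DistElems]);
    Sum += A[I];
  }
  return Sum;
}
```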
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ec58c4fe309f3..4f2e8310d769d 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -99,6 +99,9 @@ public:
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
+ unsigned Index);
+
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getArithmeticInstrCost(
@@ -127,6 +130,14 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace);
+
+ unsigned getCacheLineSize();
+
+ unsigned getPrefetchDistance();
+
+ unsigned getMinPrefetchStride();
+
+ unsigned getMaxPrefetchIterationsAhead();
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 394c8e78581f1..aebc370333e3b 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -13,7 +13,6 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -24,13 +23,14 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdio>
@@ -70,6 +70,8 @@ private:
bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
bool showMatchError(SMLoc Loc, unsigned ErrCode);
+ bool parseDirectiveArch(SMLoc L);
+ bool parseDirectiveCPU(SMLoc L);
bool parseDirectiveWord(unsigned Size, SMLoc L);
bool parseDirectiveInst(SMLoc L);
@@ -866,14 +868,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
- if (RegWidth == 32)
- Value &= 0xffffffffULL;
-
- // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
- if (Value == 0 && Shift != 0)
- return false;
-
- return (Value & ~(0xffffULL << Shift)) == 0;
+ return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
}
template<int RegWidth, int Shift>
@@ -884,16 +879,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
- // MOVZ takes precedence over MOVN.
- for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
- if ((Value & ~(0xffffULL << MOVZShift)) == 0)
- return false;
-
- Value = ~Value;
- if (RegWidth == 32)
- Value &= 0xffffffffULL;
-
- return (Value & ~(0xffffULL << Shift)) == 0;
+ return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
}
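The inline checks deleted above now live behind AArch64_AM::isMOVZMovAlias and isMOVNMovAlias. Reconstructed from the removed lines, the predicates amount to this standalone sketch:

```cpp
#include <cstdint>

// MOVZ can encode Value iff all bits outside one 16-bit halfword are zero;
// "lsl #0" takes precedence, which in practice only affects "#0, lsl #N".
static bool isMOVZMovAliasSketch(uint64_t Value, int Shift, int RegWidth) {
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  if (Value == 0 && Shift != 0)
    return false;
  return (Value & ~(0xffffULL << Shift)) == 0;
}

// MOVN applies only when no MOVZ form exists, then tests the complement.
static bool isMOVNMovAliasSketch(uint64_t Value, int Shift, int RegWidth) {
  for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
    if ((Value & ~(0xffffULL << MOVZShift)) == 0)
      return false;
  Value = ~Value;
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  return (Value & ~(0xffffULL << Shift)) == 0;
}
```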
bool isFPImm() const { return Kind == k_FPImm; }
@@ -2087,12 +2073,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PRFM::PRFMMapper();
- StringRef Name =
- Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name,
- S, getContext()));
+ auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue());
+ Operands.push_back(AArch64Operand::CreatePrefetch(
+ prfop, PRFM ? PRFM->Name : "", S, getContext()));
return MatchOperand_Success;
}
@@ -2101,18 +2084,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PRFM::PRFMMapper();
- unsigned prfop =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString());
+ if (!PRFM) {
TokError("pre-fetch hint expected");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(),
- S, getContext()));
+ Operands.push_back(AArch64Operand::CreatePrefetch(
+ PRFM->Encoding, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2127,18 +2107,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PSBHint::PSBHintMapper();
- unsigned psbhint =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString());
+ if (!PSB) {
TokError("invalid operand for instruction");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
- Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(),
- S, getContext()));
+ Operands.push_back(AArch64Operand::CreatePSBHint(
+ PSB->Encoding, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2762,12 +2739,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
Error(ExprLoc, "barrier operand out of range");
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64DB::DBarrierMapper();
- StringRef Name =
- Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
- Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name,
- ExprLoc, getContext()));
+ auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue());
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
return MatchOperand_Success;
}
@@ -2776,23 +2750,20 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64DB::DBarrierMapper();
- unsigned Opt =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto DB = AArch64DB::lookupDBByName(Tok.getString());
+ if (!DB) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
}
// The only valid named option for ISB is 'sy'
- if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+ if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
}
- Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(),
- getLoc(), getContext()));
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ DB->Encoding, Tok.getString(), getLoc(), getContext()));
Parser.Lex(); // Consume the option
return MatchOperand_Success;
@@ -2806,28 +2777,22 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- bool IsKnown;
- auto MRSMapper = AArch64SysReg::MRSMapper();
- uint32_t MRSReg = MRSMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (MRSReg != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- auto MSRMapper = AArch64SysReg::MSRMapper();
- uint32_t MSRReg = MSRMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (MSRReg != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- auto PStateMapper = AArch64PState::PStateMapper();
- uint32_t PStateField =
- PStateMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (PStateField != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- Operands.push_back(AArch64Operand::CreateSysReg(
- Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext()));
+ int MRSReg, MSRReg;
+ auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
+ if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) {
+ MRSReg = SysReg->Readable ? SysReg->Encoding : -1;
+ MSRReg = SysReg->Writeable ? SysReg->Encoding : -1;
+ } else
+ MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString());
+
+ auto PState = AArch64PState::lookupPStateByName(Tok.getString());
+ unsigned PStateImm = -1;
+ if (PState && PState->haveFeatures(getSTI().getFeatureBits()))
+ PStateImm = PState->Encoding;
+
+ Operands.push_back(
+ AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg,
+ PStateImm, getContext()));
Parser.Lex(); // Eat identifier
return MatchOperand_Success;
@@ -4195,6 +4160,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
SMLoc Loc = DirectiveID.getLoc();
+ if (IDVal == ".arch")
+ return parseDirectiveArch(Loc);
+ if (IDVal == ".cpu")
+ return parseDirectiveCPU(Loc);
if (IDVal == ".hword")
return parseDirectiveWord(2, Loc);
if (IDVal == ".word")
@@ -4216,6 +4185,99 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveLOH(IDVal, Loc);
}
+static const struct {
+ const char *Name;
+ const FeatureBitset Features;
+} ExtensionMap[] = {
+ { "crc", {AArch64::FeatureCRC} },
+ { "crypto", {AArch64::FeatureCrypto} },
+ { "fp", {AArch64::FeatureFPARMv8} },
+ { "simd", {AArch64::FeatureNEON} },
+
+ // FIXME: Unsupported extensions
+ { "lse", {} },
+ { "pan", {} },
+ { "lor", {} },
+ { "rdma", {} },
+ { "profile", {} },
+};
+
+/// parseDirectiveArch
+/// ::= .arch token
+bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
+ SMLoc ArchLoc = getLoc();
+
+ StringRef Arch, ExtensionString;
+ std::tie(Arch, ExtensionString) =
+ getParser().parseStringToEndOfStatement().trim().split('+');
+
+ unsigned ID = AArch64::parseArch(Arch);
+ if (ID == ARM::AK_INVALID) {
+ Error(ArchLoc, "unknown arch name");
+ return false;
+ }
+
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures("", "");
+ if (!ExtensionString.empty())
+ STI.setDefaultFeatures("", ("+" + ExtensionString).str());
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+ return false;
+}
+
+/// parseDirectiveCPU
+/// ::= .cpu id
+bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
+ SMLoc CPULoc = getLoc();
+
+ StringRef CPU, ExtensionString;
+ std::tie(CPU, ExtensionString) =
+ getParser().parseStringToEndOfStatement().trim().split('+');
+
+ SmallVector<StringRef, 4> RequestedExtensions;
+ if (!ExtensionString.empty())
+ ExtensionString.split(RequestedExtensions, '+');
+
+ // FIXME This is using tablegen data, but should be moved to ARMTargetParser
+ // once that is tablegen'ed
+ if (!getSTI().isCPUStringValid(CPU)) {
+ Error(CPULoc, "unknown CPU name");
+ return false;
+ }
+
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures(CPU, "");
+
+ FeatureBitset Features = STI.getFeatureBits();
+ for (auto Name : RequestedExtensions) {
+ bool EnableFeature = true;
+
+ if (Name.startswith_lower("no")) {
+ EnableFeature = false;
+ Name = Name.substr(2);
+ }
+
+ for (const auto &Extension : ExtensionMap) {
+ if (Extension.Name != Name)
+ continue;
+
+ if (Extension.Features.none())
+ report_fatal_error("unsupported architectural extension: " + Name);
+
+ FeatureBitset ToggleFeatures = EnableFeature
+ ? (~Features & Extension.Features)
+ : ( Features & Extension.Features);
+      uint64_t AvailableFeatures =
+          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+      setAvailableFeatures(AvailableFeatures);
+
+ break;
+ }
+ }
+ return false;
+}
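The toggle computation in the loop above flips the minimal set of feature bits: enabling touches only bits that are currently clear, disabling only bits that are currently set, so ToggleFeature lands on the requested state either way. A small standalone sketch of that identity (std::bitset stands in for llvm::FeatureBitset):

```cpp
#include <bitset>

using Bits = std::bitset<64>; // stand-in for llvm::FeatureBitset

// Returns the bits ToggleFeature must flip so that, afterwards, every bit
// of Ext is set (Enable) or clear (!Enable) in Current.
static Bits toggleMask(const Bits &Current, const Bits &Ext, bool Enable) {
  return Enable ? (~Current & Ext) : (Current & Ext);
}
```

With this in place, a directive such as `.cpu generic+crypto+nocrc` first applies the base CPU's default features, then enables Crypto and disables CRC using the names in ExtensionMap.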
+
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile
deleted file mode 100644
index 00268c76f8e83..0000000000000
--- a/lib/Target/AArch64/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64AsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index f26327ff84ad8..a79960ea96053 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -12,8 +12,25 @@ tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
+
add_public_tablegen_target(AArch64CommonTableGen)
+# List of all GlobalISel files.
+set(GLOBAL_ISEL_FILES
+ AArch64CallLowering.cpp
+ AArch64RegisterBankInfo.cpp
+ )
+
+# Add GlobalISel files to the dependencies if the user wants to build it.
+if(LLVM_BUILD_GLOBAL_ISEL)
+ set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
+else()
+  set(GLOBAL_ISEL_BUILD_FILES "")
+ set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+endif()
+
+
add_llvm_target(AArch64CodeGen
AArch64A57FPLoadBalancing.cpp
AArch64AddressTypePromotion.cpp
@@ -29,6 +46,7 @@ add_llvm_target(AArch64CodeGen
AArch64A53Fix835769.cpp
AArch64FrameLowering.cpp
AArch64ConditionOptimizer.cpp
+ AArch64RedundantCopyElimination.cpp
AArch64ISelDAGToDAG.cpp
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
@@ -43,6 +61,7 @@ add_llvm_target(AArch64CodeGen
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
+ ${GLOBAL_ISEL_BUILD_FILES}
)
add_dependencies(LLVMAArch64CodeGen intrinsics_gen)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index f1f968e73123e..fe6ea31b90613 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1523,13 +1523,12 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
Inst.addOperand(MCOperand::createImm(pstate_field));
Inst.addOperand(MCOperand::createImm(crm));
- bool ValidNamed;
- const AArch64Disassembler *Dis =
+ const AArch64Disassembler *Dis =
static_cast<const AArch64Disassembler *>(Decoder);
- (void)AArch64PState::PStateMapper().toString(pstate_field,
- Dis->getSubtargetInfo().getFeatureBits(), ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ auto PState = AArch64PState::lookupPStateByEncoding(pstate_field);
+ if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+ return Success;
+ return Fail;
}
static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
@@ -1574,7 +1573,7 @@ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
- return DecodeGPRSeqPairsClassRegisterClass(Inst,
+ return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::WSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
@@ -1583,7 +1582,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
- return DecodeGPRSeqPairsClassRegisterClass(Inst,
+ return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::XSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 7fb57adfeebaa..e475e505e7d12 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
namespace llvm {
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 82bc949927ce4..19d0ba2e1c415 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -134,9 +134,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
CommentStream << "literal pool symbol address: " << ReferenceName;
else if (ReferenceType ==
- LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- CommentStream << "literal pool for: \"" << ReferenceName << "\"";
- else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
+ CommentStream << "literal pool for: \"";
+ CommentStream.write_escaped(ReferenceName);
+ CommentStream << "\"";
+ } else if (ReferenceType ==
LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
else if (ReferenceType ==
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 12b8450b13c66..49e8449637971 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
-#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
namespace llvm {
diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile
deleted file mode 100644
index 741bb817a6334..0000000000000
--- a/lib/Target/AArch64/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Disassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index d8a8108243705..b4f85204714f1 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -219,6 +219,54 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
+ // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their
+ // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 >
+  // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest-priority
+  // form that can represent the value is printed as the MOV alias; the
+  // rest are printed normally.
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) &&
+ MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32;
+ int Shift = MI->getOperand(2).getImm();
+ uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift;
+
+    if (AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
+ if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+ MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32;
+ int Shift = MI->getOperand(2).getImm();
+ uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift);
+ if (RegWidth == 32)
+ Value = Value & 0xffffffff;
+
+ if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
+ if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) &&
+ (MI->getOperand(1).getReg() == AArch64::XZR ||
+ MI->getOperand(1).getReg() == AArch64::WZR) &&
+ MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::ORRXri ? 64 : 32;
+ uint64_t Value = AArch64_AM::decodeLogicalImmediate(
+ MI->getOperand(2).getImm(), RegWidth);
+ if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
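A worked check of the alias chain (standalone, illustrative): 0x2a0000 fits a single halfword at lsl #16, so MOVZ wins and prints as mov; all-ones has no MOVZ form, but its complement is zero, so the MOVN path prints "mov x0, #-1".

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // 0x2a << 16 is MOVZ-representable at shift 16.
  uint64_t V = 0x2aULL << 16;
  assert((V & ~(0xffffULL << 16)) == 0);

  // ~0 is not MOVZ-representable at any shift...
  uint64_t W = ~0ULL;
  bool MovZ = false;
  for (int S = 0; S <= 48; S += 16)
    MovZ |= (W & ~(0xffffULL << S)) == 0;
  // ...but its complement (0) is, so the MOVN alias applies at shift 0.
  assert(!MovZ && ((~W & ~0xffffULL) == 0));
  return 0;
}
```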
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -928,14 +976,21 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
unsigned Reg = Op.getReg();
O << getRegisterName(Reg);
} else if (Op.isImm()) {
- O << '#' << Op.getImm();
+ printImm(MI, OpNo, STI, O);
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
Op.getExpr()->print(O, &MAI);
}
}
-void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo,
+void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ O << "#" << formatImm(Op.getImm());
+}
+
+void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -981,12 +1036,12 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
assert(Val == MO.getImm() && "Add/sub immediate out of range!");
unsigned Shift =
AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
- O << '#' << Val;
+ O << '#' << formatImm(Val);
if (Shift != 0)
printShifter(MI, OpNum + 1, STI, O);
if (CommentStream)
- *CommentStream << '=' << (Val << Shift) << '\n';
+ *CommentStream << '=' << formatImm(Val << Shift) << '\n';
} else {
assert(MO.isExpr() && "Unexpected operand type!");
MO.getExpr()->print(O, &MAI);
@@ -1104,14 +1159,14 @@ template<int Scale>
void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << '#' << Scale * MI->getOperand(OpNum).getImm();
+ O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm());
}
void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
unsigned Scale, raw_ostream &O) {
const MCOperand MO = MI->getOperand(OpNum);
if (MO.isImm()) {
- O << "#" << (MO.getImm() * Scale);
+ O << "#" << formatImm(MO.getImm() * Scale);
} else {
assert(MO.isExpr() && "Unexpected operand type!");
MO.getExpr()->print(O, &MAI);
@@ -1123,7 +1178,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
const MCOperand MO1 = MI->getOperand(OpNum + 1);
O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
if (MO1.isImm()) {
- O << ", #" << (MO1.getImm() * Scale);
+ O << ", #" << formatImm(MO1.getImm() * Scale);
} else {
assert(MO1.isExpr() && "Unexpected operand type!");
O << ", ";
@@ -1136,26 +1191,22 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned prfop = MI->getOperand(OpNum).getImm();
- bool Valid;
- StringRef Name =
- AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name;
+ auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
+ if (PRFM)
+ O << PRFM->Name;
else
- O << '#' << prfop;
+ O << '#' << formatImm(prfop);
}
void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned psbhintop = MI->getOperand(OpNum).getImm();
- bool Valid;
- StringRef Name =
- AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name;
+ auto PSB = AArch64PSBHint::lookupPSBByEncoding(psbhintop);
+ if (PSB)
+ O << PSB->Name;
else
- O << '#' << psbhintop;
+ O << '#' << formatImm(psbhintop);
}
void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
@@ -1310,7 +1361,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() * 4);
+ O << "#" << formatImm(Op.getImm() * 4);
return;
}
@@ -1335,7 +1386,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() * (1 << 12));
+ O << "#" << formatImm(Op.getImm() * (1 << 12));
return;
}
@@ -1349,15 +1400,15 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
unsigned Val = MI->getOperand(OpNo).getImm();
unsigned Opcode = MI->getOpcode();
- bool Valid;
StringRef Name;
- if (Opcode == AArch64::ISB)
- Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(),
- Valid);
- else
- Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(),
- Valid);
- if (Valid)
+ if (Opcode == AArch64::ISB) {
+ auto ISB = AArch64ISB::lookupISBByEncoding(Val);
+ Name = ISB ? ISB->Name : "";
+ } else {
+ auto DB = AArch64DB::lookupDBByEncoding(Val);
+ Name = DB ? DB->Name : "";
+ }
+ if (!Name.empty())
O << Name;
else
O << "#" << Val;
@@ -1368,10 +1419,19 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- auto Mapper = AArch64SysReg::MRSMapper();
- std::string Name = Mapper.toString(Val, STI.getFeatureBits());
+ // Horrible hack for the one register that has identical encodings but
+ // different names in MSR and MRS. Because of this, one of MRS and MSR
+ // would pick up the wrong entry from the shared table, so special-case it.
+ if (Val == AArch64SysReg::DBGDTRRX_EL0) {
+ O << "DBGDTRRX_EL0";
+ return;
+ }
- O << StringRef(Name).upper();
+ const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+ if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
+ O << Reg->Name;
+ else
+ O << AArch64SysReg::genericRegisterString(Val);
}
void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
@@ -1379,10 +1439,19 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- auto Mapper = AArch64SysReg::MSRMapper();
- std::string Name = Mapper.toString(Val, STI.getFeatureBits());
+ // Horrible hack for the one register that has identical encodings but
+ // different names in MSR and MRS. Because of this, one of MRS and MSR
+ // would pick up the wrong entry from the shared table, so special-case it.
+ if (Val == AArch64SysReg::DBGDTRTX_EL0) {
+ O << "DBGDTRTX_EL0";
+ return;
+ }
- O << StringRef(Name).upper();
+ const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+ if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
+ O << Reg->Name;
+ else
+ O << AArch64SysReg::genericRegisterString(Val);
}
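
Both system-register printers now resolve names through the same steps; a hedged consolidation sketch follows (the wrapper function is hypothetical, but the fields and helpers are exactly the ones these hunks use; the DBGDTRRX_EL0/DBGDTRTX_EL0 special case above runs before any of this):

```cpp
static void printSysRegOperand(uint32_t Val, bool IsRead,
                               const llvm::FeatureBitset &Features,
                               llvm::raw_ostream &O) {
  using namespace llvm;
  // Take the table entry if it exists, is accessible in the requested
  // direction, and its feature requirements are met by the subtarget...
  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
  if (Reg && (IsRead ? Reg->Readable : Reg->Writeable) &&
      Reg->haveFeatures(Features))
    O << Reg->Name;
  else
    // ...otherwise fall back to the generic S<op0>_<op1>_<Cn>_<Cm>_<op2>
    // spelling, which is always valid assembly.
    O << AArch64SysReg::genericRegisterString(Val);
}
```
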
void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
@@ -1390,13 +1459,11 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
- StringRef Name =
- AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name.upper();
+ auto PState = AArch64PState::lookupPStateByEncoding(Val);
+ if (PState && PState->haveFeatures(STI.getFeatureBits()))
+ O << PState->Name;
else
- O << "#" << Val;
+ O << "#" << formatImm(Val);
}
void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index ea68d9848b427..65dca99ed04e7 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -49,7 +49,9 @@ protected:
// Operand printers
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile
deleted file mode 100644
index b17e8d080119b..0000000000000
--- a/lib/Target/AArch64/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64AsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 642c18394a67e..0196c505ba3cc 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core GlobalISel MC Scalar SelectionDAG Support Target
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 648b1dfc8c5ef..3e5ef4df47060 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -753,6 +753,49 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
return (EncVal << 32) | EncVal;
}
+static inline bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) {
+ for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
+ if ((Value & ~(0xffffULL << Shift)) == 0)
+ return true;
+
+ return false;
+}
+
+static inline bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) {
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+ if (Value == 0 && Shift != 0)
+ return false;
+
+ return (Value & ~(0xffffULL << Shift)) == 0;
+}
+
+static inline bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) {
+ // MOVZ takes precedence over MOVN.
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return false;
+
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ return isMOVZMovAlias(Value, Shift, RegWidth);
+}
+
+static inline bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return true;
+
+ // It's not a MOVZ, but it might be a MOVN.
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ return isAnyMOVZMovAlias(Value, RegWidth);
+}
+
} // end namespace AArch64_AM
} // end namespace llvm
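
These helpers encode the assembler's MOV-alias precedence: MOVZ wins whenever some 16-bit chunk placement can represent the value, and MOVN is only reported when the bitwise complement is MOVZ-representable. A small self-test of those rules, assuming this header is reachable on the include path:

```cpp
#include <cassert>
#include "MCTargetDesc/AArch64AddressingModes.h"

int main() {
  using namespace llvm::AArch64_AM;
  // 0xABCD0000 fits a single MOVZ chunk with "lsl #16"...
  assert(isMOVZMovAlias(0xABCD0000ULL, /*Shift=*/16, /*RegWidth=*/64));
  // ...so MOVZ takes precedence and it is *not* a MOVN alias.
  assert(!isMOVNMovAlias(0xABCD0000ULL, 16, 64));
  // Its complement is not MOVZ-representable, but it is a MOVN alias.
  assert(!isAnyMOVZMovAlias(~0xABCD0000ULL, 64));
  assert(isMOVNMovAlias(~0xABCD0000ULL, 16, 64));
  // Either way, some MOVZ/MOVN ("MOVW") alias exists for both values.
  assert(isAnyMOVWMovAlias(0xABCD0000ULL, 64));
  assert(isAnyMOVWMovAlias(~0xABCD0000ULL, 64));
  return 0;
}
```
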
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 7624c7240d688..27993246eb07c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -28,9 +29,12 @@ namespace {
class AArch64AsmBackend : public MCAsmBackend {
static const unsigned PCRelFlagVal =
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+protected:
+ bool IsLittleEndian;
+
 public:
- AArch64AsmBackend(const Target &T) : MCAsmBackend() {}
+ AArch64AsmBackend(const Target &T, bool IsLittleEndian)
+ : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
unsigned getNumFixupKinds() const override {
return AArch64::NumTargetFixupKinds;
@@ -74,12 +78,15 @@ public:
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
unsigned getPointerSize() const { return 8; }
+
+ unsigned getFixupKindContainerSizeInBytes(unsigned Kind) const;
};
} // end anonymous namespace
@@ -129,14 +136,16 @@ static unsigned AdrImmBits(unsigned Value) {
return (hi19 << 5) | (lo2 << 29);
}
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx) {
+ unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
- if (SignedValue > 2097151 || SignedValue < -2097152)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -144,54 +153,66 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
case AArch64::fixup_aarch64_pcrel_branch19:
// Signed 21-bit immediate
if (SignedValue > 2097151 || SignedValue < -2097152)
- report_fatal_error("fixup value out of range");
+ if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
// Low two bits are not encoded.
return (Value >> 2) & 0x7ffff;
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
// Unsigned 12-bit immediate
- if (Value >= 0x1000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && Value >= 0x1000)
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
return Value;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
// Unsigned 12-bit immediate which gets multiplied by 2
- if (Value & 1 || Value >= 0x2000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x2000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x1))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
return Value >> 1;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
// Unsigned 12-bit immediate which gets multiplied by 4
- if (Value & 3 || Value >= 0x4000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x4000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
return Value >> 2;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
// Unsigned 12-bit immediate which gets multiplied by 8
- if (Value & 7 || Value >= 0x8000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x8000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x7))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
return Value >> 3;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
// Unsigned 12-bit immediate which gets multiplied by 16
- if (Value & 15 || Value >= 0x10000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x10000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0xf))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
return Value >> 4;
case AArch64::fixup_aarch64_movw:
- report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+ if (Ctx)
+ Ctx->reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
return Value;
case AArch64::fixup_aarch64_pcrel_branch14:
// Signed 16-bit immediate
- if (SignedValue > 32767 || SignedValue < -32768)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Value & 0x3)
- report_fatal_error("fixup not sufficiently aligned");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
// Signed 28-bit immediate
- if (SignedValue > 134217727 || SignedValue < -134217728)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Value & 0x3)
- report_fatal_error("fixup not sufficiently aligned");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
case FK_Data_1:
case FK_Data_2:
@@ -201,6 +222,45 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
}
}
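
Threading an optional `MCContext` through `adjustFixupValue` lets one routine serve two phases: located diagnostics while the assembler can still report them, and plain encoding afterwards. Both call sites appear later in this diff; in sketch form:

```cpp
// In processFixupValue(): pass the real context, so out-of-range or
// misaligned values yield Ctx->reportError() with a source location
// instead of report_fatal_error()'s hard abort.
(void)adjustFixupValue(Fixup, Value, &Asm.getContext());

// In applyFixup(): pass nullptr, so every check is skipped and the value
// is merely shifted and masked into its encoded position.
uint64_t Encoded = adjustFixupValue(Fixup, Value, /*Ctx=*/nullptr);
```
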
+/// getFixupKindContainerSizeInBytes - The number of bytes of the container
+/// involved in a big-endian fixup, or 0 if the fixup is little-endian.
+unsigned
+AArch64AsmBackend::getFixupKindContainerSizeInBytes(unsigned Kind) const {
+ if (IsLittleEndian)
+ return 0;
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4:
+ return 4;
+ case FK_Data_8:
+ return 8;
+
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ case AArch64::fixup_aarch64_movw:
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ // Instructions are always stored little-endian.
+ return 0;
+ }
+}
+
void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
@@ -209,7 +269,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
// Apply any target-specific value adjustments.
- Value = adjustFixupValue(Fixup.getKind(), Value);
+ Value = adjustFixupValue(Fixup, Value, nullptr);
// Shift the value into position.
Value <<= Info.TargetOffset;
@@ -217,10 +277,25 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned Offset = Fixup.getOffset();
assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ // The byte size of the container the fixup occupies on a big-endian
+ // target, or 0 if its bytes are stored little-endian.
+ unsigned FullSizeInBytes = getFixupKindContainerSizeInBytes(Fixup.getKind());
+
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ if (FullSizeInBytes == 0) {
+ // Handle as little-endian
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
+ } else {
+ // Handle as big-endian
+ assert((Offset + FullSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert(NumBytes <= FullSizeInBytes && "Invalid fixup size!");
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = FullSizeInBytes - 1 - i;
+ Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
+ }
}
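
The split loop is the point of the new `IsLittleEndian` plumbing: data fixups on big-endian targets are written back-to-front within their container, while instruction fixups (container size 0) stay little-endian. A standalone sketch of the indexing rule, with hypothetical buffer contents:

```cpp
#include <cstdint>
#include <cstdio>

// Mirror of the patch's inner loops: ContainerSize == 0 means little-endian.
static void maskIn(uint8_t *Data, unsigned NumBytes, uint64_t Value,
                   unsigned ContainerSize) {
  for (unsigned i = 0; i != NumBytes; ++i) {
    unsigned Idx = ContainerSize == 0 ? i : ContainerSize - 1 - i;
    Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
  }
}

int main() {
  uint8_t LE[4] = {}, BE[4] = {};
  maskIn(LE, 4, 0x11223344, 0); // memory: 44 33 22 11
  maskIn(BE, 4, 0x11223344, 4); // memory: 11 22 33 44
  printf("%02x %02x\n", LE[0], BE[0]); // prints "44 11"
  return 0;
}
```
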
bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
@@ -239,6 +314,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
}
void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
MCInst &Res) const {
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
@@ -264,14 +340,14 @@ namespace CU {
enum CompactUnwindEncodings {
/// \brief A "frameless" leaf function, where no non-volatile registers are
/// saved. The return address remains in LR throughout the function.
- UNWIND_AArch64_MODE_FRAMELESS = 0x02000000,
+ UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
/// \brief No compact unwind encoding available. Instead the low 23-bits of
/// the compact unwind encoding is the offset of the DWARF FDE in the
/// __eh_frame section. This mode is never used in object files. It is only
/// generated by the linker in final linked images, which have only DWARF info
/// for a function.
- UNWIND_AArch64_MODE_DWARF = 0x03000000,
+ UNWIND_ARM64_MODE_DWARF = 0x03000000,
/// \brief This is a standard arm64 prologue where FP/LR are immediately
/// pushed on the stack, then SP is copied to FP. If there are any
@@ -279,18 +355,18 @@ enum CompactUnwindEncodings {
/// in a contiguous range right below the saved FP/LR pair. Any subset of the
/// five X pairs and four D pairs can be saved, but the memory layout must be
/// in register number order.
- UNWIND_AArch64_MODE_FRAME = 0x04000000,
+ UNWIND_ARM64_MODE_FRAME = 0x04000000,
/// \brief Frame register pair encodings.
- UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001,
- UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002,
- UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004,
- UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008,
- UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010,
- UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100,
- UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200,
- UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400,
- UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800
+ UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
+ UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
+ UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
+ UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
+ UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
+ UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
+ UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
+ UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
+ UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
};
} // end CU namespace
@@ -300,7 +376,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
/// \brief Encode compact unwind stack adjustment for frameless functions.
- /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+ /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
/// The stack size always needs to be 16-byte aligned.
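  /// For example, a frameless function with a 64-byte stack encodes
  /// (64 / 16) << 12 == 0x4000 into the compact unwind word.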
uint32_t encodeStackAdjustment(uint32_t StackSize) const {
return (StackSize / 16) << 12;
@@ -308,7 +384,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
public:
DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
- : AArch64AsmBackend(T), MRI(MRI) {}
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
@@ -319,7 +395,7 @@ public:
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty())
- return CU::UNWIND_AArch64_MODE_FRAMELESS;
+ return CU::UNWIND_ARM64_MODE_FRAMELESS;
bool HasFP = false;
unsigned StackSize = 0;
@@ -331,7 +407,7 @@ public:
switch (Inst.getOperation()) {
default:
// Cannot handle this directive: bail out.
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
@@ -356,7 +432,7 @@ public:
"Pushing invalid registers for frame!");
// Indicate that the function has a frame.
- CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
HasFP = true;
break;
}
@@ -370,11 +446,11 @@ public:
// `.cfi_offset' instructions with the appropriate registers specified.
unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
if (i + 1 == e)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &Inst2 = Instrs[++i];
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
// N.B. The encodings must be in register number order, and the X
@@ -390,19 +466,19 @@ public:
if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
(CompactUnwindEncoding & 0xF1E) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
(CompactUnwindEncoding & 0xF1C) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
(CompactUnwindEncoding & 0xF18) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
(CompactUnwindEncoding & 0xF10) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
(CompactUnwindEncoding & 0xF00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
else {
Reg1 = getDRegFromBReg(Reg1);
Reg2 = getDRegFromBReg(Reg2);
@@ -413,18 +489,18 @@ public:
// D14/D15 pair = 0x00000800
if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
(CompactUnwindEncoding & 0xE00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
(CompactUnwindEncoding & 0xC00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
(CompactUnwindEncoding & 0x800) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
else
// A pair was pushed which we cannot handle.
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
}
break;
@@ -436,9 +512,9 @@ public:
// With compact unwind info we can only represent stack adjustments of up
// to 65520 bytes.
if (StackSize > 65520)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
- CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
}
@@ -453,10 +529,9 @@ namespace {
class ELFAArch64AsmBackend : public AArch64AsmBackend {
public:
uint8_t OSABI;
- bool IsLittleEndian;
ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian)
- : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {}
+ : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian);
@@ -466,9 +541,6 @@ public:
const MCFixup &Fixup, const MCFragment *DF,
const MCValue &Target, uint64_t &Value,
bool &IsResolved) override;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
};
void ELFAArch64AsmBackend::processFixupValue(
@@ -489,34 +561,14 @@ void ELFAArch64AsmBackend::processFixupValue(
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
IsResolved = false;
-}
-
-// Returns whether this fixup is based on an address in the .eh_frame section,
-// and therefore should be byte swapped.
-// FIXME: Should be replaced with something more principled.
-static bool isByteSwappedFixup(const MCExpr *E) {
- MCValue Val;
- if (!E->evaluateAsRelocatable(Val, nullptr, nullptr))
- return false;
- if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined())
- return false;
-
- const MCSectionELF *SecELF =
- dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection());
- return SecELF->getSectionName() == ".eh_frame";
+ // Try to get the encoded value for the fixup as if we were mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ if (IsResolved)
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
-void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
- // store fixups in .eh_frame section in big endian order
- if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
- if (isByteSwappedFixup(Fixup.getValue()))
- Value = ByteSwap_32(unsigned(Value));
- }
- AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
-}
}
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 1f516d1db8968..4b4c4097b97b4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,8 +30,8 @@ public:
~AArch64ELFObjectWriter() override;
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
private:
};
@@ -43,9 +44,10 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
-unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
+unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
@@ -61,6 +63,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
case FK_Data_2:
return ELF::R_AARCH64_PREL16;
case FK_Data_4:
@@ -79,7 +84,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
return ELF::R_AARCH64_TLSDESC_ADR_PAGE21;
- llvm_unreachable("invalid symbol kind for ADRP relocation");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid symbol kind for ADRP relocation");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_pcrel_branch26:
return ELF::R_AARCH64_JUMP26;
case AArch64::fixup_aarch64_pcrel_call26:
@@ -93,10 +100,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
case AArch64::fixup_aarch64_pcrel_branch19:
return ELF::R_AARCH64_CONDBR19;
default:
- llvm_unreachable("Unsupported pc-relative fixup kind");
+ Ctx.reportError(Fixup.getLoc(), "unsupported pc-relative fixup kind");
+ return ELF::R_AARCH64_NONE;
}
} else {
switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
case FK_Data_2:
return ELF::R_AARCH64_ABS16;
case FK_Data_4:
@@ -121,8 +132,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_ADD_ABS_LO12_NC;
- report_fatal_error("invalid fixup for add (uimm12) instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for add (uimm12) instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale1:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
@@ -135,8 +147,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 8-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 8-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
@@ -149,8 +162,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 16-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 16-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
@@ -163,8 +177,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 32-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 32-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
@@ -183,14 +198,16 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
- report_fatal_error("invalid fixup for 64-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 64-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
- report_fatal_error("invalid fixup for 128-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 128-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_movw:
if (RefKind == AArch64MCExpr::VK_ABS_G3)
return ELF::R_AARCH64_MOVW_UABS_G3;
@@ -236,12 +253,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
- report_fatal_error("invalid fixup for movz/movk instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for movz/movk instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_tlsdesc_call:
return ELF::R_AARCH64_TLSDESC_CALL;
default:
- llvm_unreachable("Unknown ELF relocation type");
+ Ctx.reportError(Fixup.getLoc(), "unknown ELF relocation type");
+ return ELF::R_AARCH64_NONE;
}
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 7d8e79bc63c87..7b9ff8fa05031 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -154,24 +154,6 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getSIMDShift64OpValue - Return the encoded value for the
- // shift-by-immediate AdvSIMD instructions.
- uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
@@ -428,41 +410,6 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
llvm_unreachable("Invalid value for vector shift amount!");
}
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 64 - (MO.getImm());
-}
-
-uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
- const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 64 - (MO.getImm() | 32);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 32 - (MO.getImm() | 16);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 16 - (MO.getImm() | 8);
-}
-
/// getFixedPointScaleOpValue - Return the encoded value for the
// FP-to-fixed-point scale factor.
uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 9f7bed0d3b125..7027806212084 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
#include "InstPrinter/AArch64InstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -72,10 +71,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
"Only expect Darwin and ELF targets");
@@ -89,19 +86,6 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
else if (CM != CodeModel::Small && CM != CodeModel::Large)
report_fatal_error(
"Only small and large code models are allowed on AArch64");
-
- // AArch64 Darwin is always PIC.
- if (TT.isOSDarwin())
- RM = Reloc::PIC_;
- // On ELF platforms the default static relocation model has a smart enough
- // linker to cope with referencing external symbols defined in a shared
- // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
- else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
- RM = Reloc::Static;
-
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
@@ -140,7 +124,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo);
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 342384437c6a4..39414cc0c6a52 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
-#include <string>
namespace llvm {
class formatted_raw_ostream;
diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile
deleted file mode 100644
index 5779ac5ac60a8..0000000000000
--- a/lib/Target/AArch64/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Desc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile
deleted file mode 100644
index f356c58504131..0000000000000
--- a/lib/Target/AArch64/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAArch64CodeGen
-TARGET = AArch64
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \
- AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \
- AArch64GenDAGISel.inc \
- AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \
- AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \
- AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \
- AArch64GenMCPseudoLowering.inc
-
-DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile
deleted file mode 100644
index 9dc9aa4bccf7a..0000000000000
--- a/lib/Target/AArch64/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Info
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index cde1c6df26084..e65ba1f2401d7 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -11,858 +11,84 @@
//
//===----------------------------------------------------------------------===//
#include "AArch64BaseInfo.h"
-#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Regex.h"
using namespace llvm;
-StringRef AArch64NamedImmMapper::toString(uint32_t Value,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- for (unsigned i = 0; i < NumMappings; ++i) {
- if (Mappings[i].isValueEqual(Value, FeatureBits)) {
- Valid = true;
- return Mappings[i].Name;
- }
+namespace llvm {
+ namespace AArch64AT {
+#define GET_AT_IMPL
+#include "AArch64GenSystemOperands.inc"
}
-
- Valid = false;
- return StringRef();
}
-uint32_t AArch64NamedImmMapper::fromString(StringRef Name,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- std::string LowerCaseName = Name.lower();
- for (unsigned i = 0; i < NumMappings; ++i) {
- if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) {
- Valid = true;
- return Mappings[i].Value;
- }
- }
- Valid = false;
- return -1;
+namespace llvm {
+ namespace AArch64DB {
+#define GET_DB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-bool AArch64NamedImmMapper::validImm(uint32_t Value) const {
- return Value < TooBigImm;
+namespace llvm {
+ namespace AArch64DC {
+#define GET_DC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = {
- {"s1e1r", S1E1R, {}},
- {"s1e2r", S1E2R, {}},
- {"s1e3r", S1E3R, {}},
- {"s1e1w", S1E1W, {}},
- {"s1e2w", S1E2W, {}},
- {"s1e3w", S1E3W, {}},
- {"s1e0r", S1E0R, {}},
- {"s1e0w", S1E0W, {}},
- {"s12e1r", S12E1R, {}},
- {"s12e1w", S12E1W, {}},
- {"s12e0r", S12E0R, {}},
- {"s12e0w", S12E0W, {}},
-};
-
-AArch64AT::ATMapper::ATMapper()
- : AArch64NamedImmMapper(ATMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = {
- {"oshld", OSHLD, {}},
- {"oshst", OSHST, {}},
- {"osh", OSH, {}},
- {"nshld", NSHLD, {}},
- {"nshst", NSHST, {}},
- {"nsh", NSH, {}},
- {"ishld", ISHLD, {}},
- {"ishst", ISHST, {}},
- {"ish", ISH, {}},
- {"ld", LD, {}},
- {"st", ST, {}},
- {"sy", SY, {}}
-};
-
-AArch64DB::DBarrierMapper::DBarrierMapper()
- : AArch64NamedImmMapper(DBarrierMappings, 16u) {}
-
-const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = {
- {"zva", ZVA, {}},
- {"ivac", IVAC, {}},
- {"isw", ISW, {}},
- {"cvac", CVAC, {}},
- {"csw", CSW, {}},
- {"cvau", CVAU, {}},
- {"civac", CIVAC, {}},
- {"cisw", CISW, {}}
-};
-
-AArch64DC::DCMapper::DCMapper()
- : AArch64NamedImmMapper(DCMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = {
- {"ialluis", IALLUIS, {}},
- {"iallu", IALLU, {}},
- {"ivau", IVAU, {}}
-};
-
-AArch64IC::ICMapper::ICMapper()
- : AArch64NamedImmMapper(ICMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = {
- {"sy", SY, {}},
-};
-
-AArch64ISB::ISBMapper::ISBMapper()
- : AArch64NamedImmMapper(ISBMappings, 16) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = {
- {"pldl1keep", PLDL1KEEP, {}},
- {"pldl1strm", PLDL1STRM, {}},
- {"pldl2keep", PLDL2KEEP, {}},
- {"pldl2strm", PLDL2STRM, {}},
- {"pldl3keep", PLDL3KEEP, {}},
- {"pldl3strm", PLDL3STRM, {}},
- {"plil1keep", PLIL1KEEP, {}},
- {"plil1strm", PLIL1STRM, {}},
- {"plil2keep", PLIL2KEEP, {}},
- {"plil2strm", PLIL2STRM, {}},
- {"plil3keep", PLIL3KEEP, {}},
- {"plil3strm", PLIL3STRM, {}},
- {"pstl1keep", PSTL1KEEP, {}},
- {"pstl1strm", PSTL1STRM, {}},
- {"pstl2keep", PSTL2KEEP, {}},
- {"pstl2strm", PSTL2STRM, {}},
- {"pstl3keep", PSTL3KEEP, {}},
- {"pstl3strm", PSTL3STRM, {}}
-};
-
-AArch64PRFM::PRFMMapper::PRFMMapper()
- : AArch64NamedImmMapper(PRFMMappings, 32) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = {
- {"spsel", SPSel, {}},
- {"daifset", DAIFSet, {}},
- {"daifclr", DAIFClr, {}},
-
- // v8.1a "Privileged Access Never" extension-specific PStates
- {"pan", PAN, {AArch64::HasV8_1aOps}},
-
- // v8.2a
- {"uao", UAO, {AArch64::HasV8_2aOps}},
-};
-
-AArch64PState::PStateMapper::PStateMapper()
- : AArch64NamedImmMapper(PStateMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = {
- // v8.2a "Statistical Profiling" extension-specific PSB operand
- {"csync", CSync, {AArch64::FeatureSPE}},
-};
-
-AArch64PSBHint::PSBHintMapper::PSBHintMapper()
- : AArch64NamedImmMapper(PSBHintMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
- {"mdccsr_el0", MDCCSR_EL0, {}},
- {"dbgdtrrx_el0", DBGDTRRX_EL0, {}},
- {"mdrar_el1", MDRAR_EL1, {}},
- {"oslsr_el1", OSLSR_EL1, {}},
- {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}},
- {"pmceid0_el0", PMCEID0_EL0, {}},
- {"pmceid1_el0", PMCEID1_EL0, {}},
- {"midr_el1", MIDR_EL1, {}},
- {"ccsidr_el1", CCSIDR_EL1, {}},
- {"clidr_el1", CLIDR_EL1, {}},
- {"ctr_el0", CTR_EL0, {}},
- {"mpidr_el1", MPIDR_EL1, {}},
- {"revidr_el1", REVIDR_EL1, {}},
- {"aidr_el1", AIDR_EL1, {}},
- {"dczid_el0", DCZID_EL0, {}},
- {"id_pfr0_el1", ID_PFR0_EL1, {}},
- {"id_pfr1_el1", ID_PFR1_EL1, {}},
- {"id_dfr0_el1", ID_DFR0_EL1, {}},
- {"id_afr0_el1", ID_AFR0_EL1, {}},
- {"id_mmfr0_el1", ID_MMFR0_EL1, {}},
- {"id_mmfr1_el1", ID_MMFR1_EL1, {}},
- {"id_mmfr2_el1", ID_MMFR2_EL1, {}},
- {"id_mmfr3_el1", ID_MMFR3_EL1, {}},
- {"id_mmfr4_el1", ID_MMFR4_EL1, {}},
- {"id_isar0_el1", ID_ISAR0_EL1, {}},
- {"id_isar1_el1", ID_ISAR1_EL1, {}},
- {"id_isar2_el1", ID_ISAR2_EL1, {}},
- {"id_isar3_el1", ID_ISAR3_EL1, {}},
- {"id_isar4_el1", ID_ISAR4_EL1, {}},
- {"id_isar5_el1", ID_ISAR5_EL1, {}},
- {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}},
- {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}},
- {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}},
- {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}},
- {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}},
- {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}},
- {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}},
- {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}},
- {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}},
- {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}},
- {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}},
- {"mvfr0_el1", MVFR0_EL1, {}},
- {"mvfr1_el1", MVFR1_EL1, {}},
- {"mvfr2_el1", MVFR2_EL1, {}},
- {"rvbar_el1", RVBAR_EL1, {}},
- {"rvbar_el2", RVBAR_EL2, {}},
- {"rvbar_el3", RVBAR_EL3, {}},
- {"isr_el1", ISR_EL1, {}},
- {"cntpct_el0", CNTPCT_EL0, {}},
- {"cntvct_el0", CNTVCT_EL0, {}},
-
- // Trace registers
- {"trcstatr", TRCSTATR, {}},
- {"trcidr8", TRCIDR8, {}},
- {"trcidr9", TRCIDR9, {}},
- {"trcidr10", TRCIDR10, {}},
- {"trcidr11", TRCIDR11, {}},
- {"trcidr12", TRCIDR12, {}},
- {"trcidr13", TRCIDR13, {}},
- {"trcidr0", TRCIDR0, {}},
- {"trcidr1", TRCIDR1, {}},
- {"trcidr2", TRCIDR2, {}},
- {"trcidr3", TRCIDR3, {}},
- {"trcidr4", TRCIDR4, {}},
- {"trcidr5", TRCIDR5, {}},
- {"trcidr6", TRCIDR6, {}},
- {"trcidr7", TRCIDR7, {}},
- {"trcoslsr", TRCOSLSR, {}},
- {"trcpdsr", TRCPDSR, {}},
- {"trcdevaff0", TRCDEVAFF0, {}},
- {"trcdevaff1", TRCDEVAFF1, {}},
- {"trclsr", TRCLSR, {}},
- {"trcauthstatus", TRCAUTHSTATUS, {}},
- {"trcdevarch", TRCDEVARCH, {}},
- {"trcdevid", TRCDEVID, {}},
- {"trcdevtype", TRCDEVTYPE, {}},
- {"trcpidr4", TRCPIDR4, {}},
- {"trcpidr5", TRCPIDR5, {}},
- {"trcpidr6", TRCPIDR6, {}},
- {"trcpidr7", TRCPIDR7, {}},
- {"trcpidr0", TRCPIDR0, {}},
- {"trcpidr1", TRCPIDR1, {}},
- {"trcpidr2", TRCPIDR2, {}},
- {"trcpidr3", TRCPIDR3, {}},
- {"trccidr0", TRCCIDR0, {}},
- {"trccidr1", TRCCIDR1, {}},
- {"trccidr2", TRCCIDR2, {}},
- {"trccidr3", TRCCIDR3, {}},
-
- // GICv3 registers
- {"icc_iar1_el1", ICC_IAR1_EL1, {}},
- {"icc_iar0_el1", ICC_IAR0_EL1, {}},
- {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}},
- {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}},
- {"icc_rpr_el1", ICC_RPR_EL1, {}},
- {"ich_vtr_el2", ICH_VTR_EL2, {}},
- {"ich_eisr_el2", ICH_EISR_EL2, {}},
- {"ich_elsr_el2", ICH_ELSR_EL2, {}},
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}},
-};
-
-AArch64SysReg::MRSMapper::MRSMapper() {
- InstMappings = &MRSMappings[0];
- NumInstMappings = llvm::array_lengthof(MRSMappings);
+namespace llvm {
+ namespace AArch64IC {
+#define GET_IC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = {
- {"dbgdtrtx_el0", DBGDTRTX_EL0, {}},
- {"oslar_el1", OSLAR_EL1, {}},
- {"pmswinc_el0", PMSWINC_EL0, {}},
-
- // Trace registers
- {"trcoslar", TRCOSLAR, {}},
- {"trclar", TRCLAR, {}},
-
- // GICv3 registers
- {"icc_eoir1_el1", ICC_EOIR1_EL1, {}},
- {"icc_eoir0_el1", ICC_EOIR0_EL1, {}},
- {"icc_dir_el1", ICC_DIR_EL1, {}},
- {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}},
- {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}},
- {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}},
-};
-
-AArch64SysReg::MSRMapper::MSRMapper() {
- InstMappings = &MSRMappings[0];
- NumInstMappings = llvm::array_lengthof(MSRMappings);
+namespace llvm {
+ namespace AArch64ISB {
+#define GET_ISB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+namespace llvm {
+ namespace AArch64PRFM {
+#define GET_PRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
+namespace llvm {
+ namespace AArch64PState {
+#define GET_PSTATE_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
-const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = {
- {"osdtrrx_el1", OSDTRRX_EL1, {}},
- {"osdtrtx_el1", OSDTRTX_EL1, {}},
- {"teecr32_el1", TEECR32_EL1, {}},
- {"mdccint_el1", MDCCINT_EL1, {}},
- {"mdscr_el1", MDSCR_EL1, {}},
- {"dbgdtr_el0", DBGDTR_EL0, {}},
- {"oseccr_el1", OSECCR_EL1, {}},
- {"dbgvcr32_el2", DBGVCR32_EL2, {}},
- {"dbgbvr0_el1", DBGBVR0_EL1, {}},
- {"dbgbvr1_el1", DBGBVR1_EL1, {}},
- {"dbgbvr2_el1", DBGBVR2_EL1, {}},
- {"dbgbvr3_el1", DBGBVR3_EL1, {}},
- {"dbgbvr4_el1", DBGBVR4_EL1, {}},
- {"dbgbvr5_el1", DBGBVR5_EL1, {}},
- {"dbgbvr6_el1", DBGBVR6_EL1, {}},
- {"dbgbvr7_el1", DBGBVR7_EL1, {}},
- {"dbgbvr8_el1", DBGBVR8_EL1, {}},
- {"dbgbvr9_el1", DBGBVR9_EL1, {}},
- {"dbgbvr10_el1", DBGBVR10_EL1, {}},
- {"dbgbvr11_el1", DBGBVR11_EL1, {}},
- {"dbgbvr12_el1", DBGBVR12_EL1, {}},
- {"dbgbvr13_el1", DBGBVR13_EL1, {}},
- {"dbgbvr14_el1", DBGBVR14_EL1, {}},
- {"dbgbvr15_el1", DBGBVR15_EL1, {}},
- {"dbgbcr0_el1", DBGBCR0_EL1, {}},
- {"dbgbcr1_el1", DBGBCR1_EL1, {}},
- {"dbgbcr2_el1", DBGBCR2_EL1, {}},
- {"dbgbcr3_el1", DBGBCR3_EL1, {}},
- {"dbgbcr4_el1", DBGBCR4_EL1, {}},
- {"dbgbcr5_el1", DBGBCR5_EL1, {}},
- {"dbgbcr6_el1", DBGBCR6_EL1, {}},
- {"dbgbcr7_el1", DBGBCR7_EL1, {}},
- {"dbgbcr8_el1", DBGBCR8_EL1, {}},
- {"dbgbcr9_el1", DBGBCR9_EL1, {}},
- {"dbgbcr10_el1", DBGBCR10_EL1, {}},
- {"dbgbcr11_el1", DBGBCR11_EL1, {}},
- {"dbgbcr12_el1", DBGBCR12_EL1, {}},
- {"dbgbcr13_el1", DBGBCR13_EL1, {}},
- {"dbgbcr14_el1", DBGBCR14_EL1, {}},
- {"dbgbcr15_el1", DBGBCR15_EL1, {}},
- {"dbgwvr0_el1", DBGWVR0_EL1, {}},
- {"dbgwvr1_el1", DBGWVR1_EL1, {}},
- {"dbgwvr2_el1", DBGWVR2_EL1, {}},
- {"dbgwvr3_el1", DBGWVR3_EL1, {}},
- {"dbgwvr4_el1", DBGWVR4_EL1, {}},
- {"dbgwvr5_el1", DBGWVR5_EL1, {}},
- {"dbgwvr6_el1", DBGWVR6_EL1, {}},
- {"dbgwvr7_el1", DBGWVR7_EL1, {}},
- {"dbgwvr8_el1", DBGWVR8_EL1, {}},
- {"dbgwvr9_el1", DBGWVR9_EL1, {}},
- {"dbgwvr10_el1", DBGWVR10_EL1, {}},
- {"dbgwvr11_el1", DBGWVR11_EL1, {}},
- {"dbgwvr12_el1", DBGWVR12_EL1, {}},
- {"dbgwvr13_el1", DBGWVR13_EL1, {}},
- {"dbgwvr14_el1", DBGWVR14_EL1, {}},
- {"dbgwvr15_el1", DBGWVR15_EL1, {}},
- {"dbgwcr0_el1", DBGWCR0_EL1, {}},
- {"dbgwcr1_el1", DBGWCR1_EL1, {}},
- {"dbgwcr2_el1", DBGWCR2_EL1, {}},
- {"dbgwcr3_el1", DBGWCR3_EL1, {}},
- {"dbgwcr4_el1", DBGWCR4_EL1, {}},
- {"dbgwcr5_el1", DBGWCR5_EL1, {}},
- {"dbgwcr6_el1", DBGWCR6_EL1, {}},
- {"dbgwcr7_el1", DBGWCR7_EL1, {}},
- {"dbgwcr8_el1", DBGWCR8_EL1, {}},
- {"dbgwcr9_el1", DBGWCR9_EL1, {}},
- {"dbgwcr10_el1", DBGWCR10_EL1, {}},
- {"dbgwcr11_el1", DBGWCR11_EL1, {}},
- {"dbgwcr12_el1", DBGWCR12_EL1, {}},
- {"dbgwcr13_el1", DBGWCR13_EL1, {}},
- {"dbgwcr14_el1", DBGWCR14_EL1, {}},
- {"dbgwcr15_el1", DBGWCR15_EL1, {}},
- {"teehbr32_el1", TEEHBR32_EL1, {}},
- {"osdlr_el1", OSDLR_EL1, {}},
- {"dbgprcr_el1", DBGPRCR_EL1, {}},
- {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}},
- {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}},
- {"csselr_el1", CSSELR_EL1, {}},
- {"vpidr_el2", VPIDR_EL2, {}},
- {"vmpidr_el2", VMPIDR_EL2, {}},
- {"sctlr_el1", SCTLR_EL1, {}},
- {"sctlr_el2", SCTLR_EL2, {}},
- {"sctlr_el3", SCTLR_EL3, {}},
- {"actlr_el1", ACTLR_EL1, {}},
- {"actlr_el2", ACTLR_EL2, {}},
- {"actlr_el3", ACTLR_EL3, {}},
- {"cpacr_el1", CPACR_EL1, {}},
- {"hcr_el2", HCR_EL2, {}},
- {"scr_el3", SCR_EL3, {}},
- {"mdcr_el2", MDCR_EL2, {}},
- {"sder32_el3", SDER32_EL3, {}},
- {"cptr_el2", CPTR_EL2, {}},
- {"cptr_el3", CPTR_EL3, {}},
- {"hstr_el2", HSTR_EL2, {}},
- {"hacr_el2", HACR_EL2, {}},
- {"mdcr_el3", MDCR_EL3, {}},
- {"ttbr0_el1", TTBR0_EL1, {}},
- {"ttbr0_el2", TTBR0_EL2, {}},
- {"ttbr0_el3", TTBR0_EL3, {}},
- {"ttbr1_el1", TTBR1_EL1, {}},
- {"tcr_el1", TCR_EL1, {}},
- {"tcr_el2", TCR_EL2, {}},
- {"tcr_el3", TCR_EL3, {}},
- {"vttbr_el2", VTTBR_EL2, {}},
- {"vtcr_el2", VTCR_EL2, {}},
- {"dacr32_el2", DACR32_EL2, {}},
- {"spsr_el1", SPSR_EL1, {}},
- {"spsr_el2", SPSR_EL2, {}},
- {"spsr_el3", SPSR_EL3, {}},
- {"elr_el1", ELR_EL1, {}},
- {"elr_el2", ELR_EL2, {}},
- {"elr_el3", ELR_EL3, {}},
- {"sp_el0", SP_EL0, {}},
- {"sp_el1", SP_EL1, {}},
- {"sp_el2", SP_EL2, {}},
- {"spsel", SPSel, {}},
- {"nzcv", NZCV, {}},
- {"daif", DAIF, {}},
- {"currentel", CurrentEL, {}},
- {"spsr_irq", SPSR_irq, {}},
- {"spsr_abt", SPSR_abt, {}},
- {"spsr_und", SPSR_und, {}},
- {"spsr_fiq", SPSR_fiq, {}},
- {"fpcr", FPCR, {}},
- {"fpsr", FPSR, {}},
- {"dspsr_el0", DSPSR_EL0, {}},
- {"dlr_el0", DLR_EL0, {}},
- {"ifsr32_el2", IFSR32_EL2, {}},
- {"afsr0_el1", AFSR0_EL1, {}},
- {"afsr0_el2", AFSR0_EL2, {}},
- {"afsr0_el3", AFSR0_EL3, {}},
- {"afsr1_el1", AFSR1_EL1, {}},
- {"afsr1_el2", AFSR1_EL2, {}},
- {"afsr1_el3", AFSR1_EL3, {}},
- {"esr_el1", ESR_EL1, {}},
- {"esr_el2", ESR_EL2, {}},
- {"esr_el3", ESR_EL3, {}},
- {"fpexc32_el2", FPEXC32_EL2, {}},
- {"far_el1", FAR_EL1, {}},
- {"far_el2", FAR_EL2, {}},
- {"far_el3", FAR_EL3, {}},
- {"hpfar_el2", HPFAR_EL2, {}},
- {"par_el1", PAR_EL1, {}},
- {"pmcr_el0", PMCR_EL0, {}},
- {"pmcntenset_el0", PMCNTENSET_EL0, {}},
- {"pmcntenclr_el0", PMCNTENCLR_EL0, {}},
- {"pmovsclr_el0", PMOVSCLR_EL0, {}},
- {"pmselr_el0", PMSELR_EL0, {}},
- {"pmccntr_el0", PMCCNTR_EL0, {}},
- {"pmxevtyper_el0", PMXEVTYPER_EL0, {}},
- {"pmxevcntr_el0", PMXEVCNTR_EL0, {}},
- {"pmuserenr_el0", PMUSERENR_EL0, {}},
- {"pmintenset_el1", PMINTENSET_EL1, {}},
- {"pmintenclr_el1", PMINTENCLR_EL1, {}},
- {"pmovsset_el0", PMOVSSET_EL0, {}},
- {"mair_el1", MAIR_EL1, {}},
- {"mair_el2", MAIR_EL2, {}},
- {"mair_el3", MAIR_EL3, {}},
- {"amair_el1", AMAIR_EL1, {}},
- {"amair_el2", AMAIR_EL2, {}},
- {"amair_el3", AMAIR_EL3, {}},
- {"vbar_el1", VBAR_EL1, {}},
- {"vbar_el2", VBAR_EL2, {}},
- {"vbar_el3", VBAR_EL3, {}},
- {"rmr_el1", RMR_EL1, {}},
- {"rmr_el2", RMR_EL2, {}},
- {"rmr_el3", RMR_EL3, {}},
- {"contextidr_el1", CONTEXTIDR_EL1, {}},
- {"tpidr_el0", TPIDR_EL0, {}},
- {"tpidr_el2", TPIDR_EL2, {}},
- {"tpidr_el3", TPIDR_EL3, {}},
- {"tpidrro_el0", TPIDRRO_EL0, {}},
- {"tpidr_el1", TPIDR_EL1, {}},
- {"cntfrq_el0", CNTFRQ_EL0, {}},
- {"cntvoff_el2", CNTVOFF_EL2, {}},
- {"cntkctl_el1", CNTKCTL_EL1, {}},
- {"cnthctl_el2", CNTHCTL_EL2, {}},
- {"cntp_tval_el0", CNTP_TVAL_EL0, {}},
- {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}},
- {"cntps_tval_el1", CNTPS_TVAL_EL1, {}},
- {"cntp_ctl_el0", CNTP_CTL_EL0, {}},
- {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}},
- {"cntps_ctl_el1", CNTPS_CTL_EL1, {}},
- {"cntp_cval_el0", CNTP_CVAL_EL0, {}},
- {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}},
- {"cntps_cval_el1", CNTPS_CVAL_EL1, {}},
- {"cntv_tval_el0", CNTV_TVAL_EL0, {}},
- {"cntv_ctl_el0", CNTV_CTL_EL0, {}},
- {"cntv_cval_el0", CNTV_CVAL_EL0, {}},
- {"pmevcntr0_el0", PMEVCNTR0_EL0, {}},
- {"pmevcntr1_el0", PMEVCNTR1_EL0, {}},
- {"pmevcntr2_el0", PMEVCNTR2_EL0, {}},
- {"pmevcntr3_el0", PMEVCNTR3_EL0, {}},
- {"pmevcntr4_el0", PMEVCNTR4_EL0, {}},
- {"pmevcntr5_el0", PMEVCNTR5_EL0, {}},
- {"pmevcntr6_el0", PMEVCNTR6_EL0, {}},
- {"pmevcntr7_el0", PMEVCNTR7_EL0, {}},
- {"pmevcntr8_el0", PMEVCNTR8_EL0, {}},
- {"pmevcntr9_el0", PMEVCNTR9_EL0, {}},
- {"pmevcntr10_el0", PMEVCNTR10_EL0, {}},
- {"pmevcntr11_el0", PMEVCNTR11_EL0, {}},
- {"pmevcntr12_el0", PMEVCNTR12_EL0, {}},
- {"pmevcntr13_el0", PMEVCNTR13_EL0, {}},
- {"pmevcntr14_el0", PMEVCNTR14_EL0, {}},
- {"pmevcntr15_el0", PMEVCNTR15_EL0, {}},
- {"pmevcntr16_el0", PMEVCNTR16_EL0, {}},
- {"pmevcntr17_el0", PMEVCNTR17_EL0, {}},
- {"pmevcntr18_el0", PMEVCNTR18_EL0, {}},
- {"pmevcntr19_el0", PMEVCNTR19_EL0, {}},
- {"pmevcntr20_el0", PMEVCNTR20_EL0, {}},
- {"pmevcntr21_el0", PMEVCNTR21_EL0, {}},
- {"pmevcntr22_el0", PMEVCNTR22_EL0, {}},
- {"pmevcntr23_el0", PMEVCNTR23_EL0, {}},
- {"pmevcntr24_el0", PMEVCNTR24_EL0, {}},
- {"pmevcntr25_el0", PMEVCNTR25_EL0, {}},
- {"pmevcntr26_el0", PMEVCNTR26_EL0, {}},
- {"pmevcntr27_el0", PMEVCNTR27_EL0, {}},
- {"pmevcntr28_el0", PMEVCNTR28_EL0, {}},
- {"pmevcntr29_el0", PMEVCNTR29_EL0, {}},
- {"pmevcntr30_el0", PMEVCNTR30_EL0, {}},
- {"pmccfiltr_el0", PMCCFILTR_EL0, {}},
- {"pmevtyper0_el0", PMEVTYPER0_EL0, {}},
- {"pmevtyper1_el0", PMEVTYPER1_EL0, {}},
- {"pmevtyper2_el0", PMEVTYPER2_EL0, {}},
- {"pmevtyper3_el0", PMEVTYPER3_EL0, {}},
- {"pmevtyper4_el0", PMEVTYPER4_EL0, {}},
- {"pmevtyper5_el0", PMEVTYPER5_EL0, {}},
- {"pmevtyper6_el0", PMEVTYPER6_EL0, {}},
- {"pmevtyper7_el0", PMEVTYPER7_EL0, {}},
- {"pmevtyper8_el0", PMEVTYPER8_EL0, {}},
- {"pmevtyper9_el0", PMEVTYPER9_EL0, {}},
- {"pmevtyper10_el0", PMEVTYPER10_EL0, {}},
- {"pmevtyper11_el0", PMEVTYPER11_EL0, {}},
- {"pmevtyper12_el0", PMEVTYPER12_EL0, {}},
- {"pmevtyper13_el0", PMEVTYPER13_EL0, {}},
- {"pmevtyper14_el0", PMEVTYPER14_EL0, {}},
- {"pmevtyper15_el0", PMEVTYPER15_EL0, {}},
- {"pmevtyper16_el0", PMEVTYPER16_EL0, {}},
- {"pmevtyper17_el0", PMEVTYPER17_EL0, {}},
- {"pmevtyper18_el0", PMEVTYPER18_EL0, {}},
- {"pmevtyper19_el0", PMEVTYPER19_EL0, {}},
- {"pmevtyper20_el0", PMEVTYPER20_EL0, {}},
- {"pmevtyper21_el0", PMEVTYPER21_EL0, {}},
- {"pmevtyper22_el0", PMEVTYPER22_EL0, {}},
- {"pmevtyper23_el0", PMEVTYPER23_EL0, {}},
- {"pmevtyper24_el0", PMEVTYPER24_EL0, {}},
- {"pmevtyper25_el0", PMEVTYPER25_EL0, {}},
- {"pmevtyper26_el0", PMEVTYPER26_EL0, {}},
- {"pmevtyper27_el0", PMEVTYPER27_EL0, {}},
- {"pmevtyper28_el0", PMEVTYPER28_EL0, {}},
- {"pmevtyper29_el0", PMEVTYPER29_EL0, {}},
- {"pmevtyper30_el0", PMEVTYPER30_EL0, {}},
-
- // Trace registers
- {"trcprgctlr", TRCPRGCTLR, {}},
- {"trcprocselr", TRCPROCSELR, {}},
- {"trcconfigr", TRCCONFIGR, {}},
- {"trcauxctlr", TRCAUXCTLR, {}},
- {"trceventctl0r", TRCEVENTCTL0R, {}},
- {"trceventctl1r", TRCEVENTCTL1R, {}},
- {"trcstallctlr", TRCSTALLCTLR, {}},
- {"trctsctlr", TRCTSCTLR, {}},
- {"trcsyncpr", TRCSYNCPR, {}},
- {"trcccctlr", TRCCCCTLR, {}},
- {"trcbbctlr", TRCBBCTLR, {}},
- {"trctraceidr", TRCTRACEIDR, {}},
- {"trcqctlr", TRCQCTLR, {}},
- {"trcvictlr", TRCVICTLR, {}},
- {"trcviiectlr", TRCVIIECTLR, {}},
- {"trcvissctlr", TRCVISSCTLR, {}},
- {"trcvipcssctlr", TRCVIPCSSCTLR, {}},
- {"trcvdctlr", TRCVDCTLR, {}},
- {"trcvdsacctlr", TRCVDSACCTLR, {}},
- {"trcvdarcctlr", TRCVDARCCTLR, {}},
- {"trcseqevr0", TRCSEQEVR0, {}},
- {"trcseqevr1", TRCSEQEVR1, {}},
- {"trcseqevr2", TRCSEQEVR2, {}},
- {"trcseqrstevr", TRCSEQRSTEVR, {}},
- {"trcseqstr", TRCSEQSTR, {}},
- {"trcextinselr", TRCEXTINSELR, {}},
- {"trccntrldvr0", TRCCNTRLDVR0, {}},
- {"trccntrldvr1", TRCCNTRLDVR1, {}},
- {"trccntrldvr2", TRCCNTRLDVR2, {}},
- {"trccntrldvr3", TRCCNTRLDVR3, {}},
- {"trccntctlr0", TRCCNTCTLR0, {}},
- {"trccntctlr1", TRCCNTCTLR1, {}},
- {"trccntctlr2", TRCCNTCTLR2, {}},
- {"trccntctlr3", TRCCNTCTLR3, {}},
- {"trccntvr0", TRCCNTVR0, {}},
- {"trccntvr1", TRCCNTVR1, {}},
- {"trccntvr2", TRCCNTVR2, {}},
- {"trccntvr3", TRCCNTVR3, {}},
- {"trcimspec0", TRCIMSPEC0, {}},
- {"trcimspec1", TRCIMSPEC1, {}},
- {"trcimspec2", TRCIMSPEC2, {}},
- {"trcimspec3", TRCIMSPEC3, {}},
- {"trcimspec4", TRCIMSPEC4, {}},
- {"trcimspec5", TRCIMSPEC5, {}},
- {"trcimspec6", TRCIMSPEC6, {}},
- {"trcimspec7", TRCIMSPEC7, {}},
- {"trcrsctlr2", TRCRSCTLR2, {}},
- {"trcrsctlr3", TRCRSCTLR3, {}},
- {"trcrsctlr4", TRCRSCTLR4, {}},
- {"trcrsctlr5", TRCRSCTLR5, {}},
- {"trcrsctlr6", TRCRSCTLR6, {}},
- {"trcrsctlr7", TRCRSCTLR7, {}},
- {"trcrsctlr8", TRCRSCTLR8, {}},
- {"trcrsctlr9", TRCRSCTLR9, {}},
- {"trcrsctlr10", TRCRSCTLR10, {}},
- {"trcrsctlr11", TRCRSCTLR11, {}},
- {"trcrsctlr12", TRCRSCTLR12, {}},
- {"trcrsctlr13", TRCRSCTLR13, {}},
- {"trcrsctlr14", TRCRSCTLR14, {}},
- {"trcrsctlr15", TRCRSCTLR15, {}},
- {"trcrsctlr16", TRCRSCTLR16, {}},
- {"trcrsctlr17", TRCRSCTLR17, {}},
- {"trcrsctlr18", TRCRSCTLR18, {}},
- {"trcrsctlr19", TRCRSCTLR19, {}},
- {"trcrsctlr20", TRCRSCTLR20, {}},
- {"trcrsctlr21", TRCRSCTLR21, {}},
- {"trcrsctlr22", TRCRSCTLR22, {}},
- {"trcrsctlr23", TRCRSCTLR23, {}},
- {"trcrsctlr24", TRCRSCTLR24, {}},
- {"trcrsctlr25", TRCRSCTLR25, {}},
- {"trcrsctlr26", TRCRSCTLR26, {}},
- {"trcrsctlr27", TRCRSCTLR27, {}},
- {"trcrsctlr28", TRCRSCTLR28, {}},
- {"trcrsctlr29", TRCRSCTLR29, {}},
- {"trcrsctlr30", TRCRSCTLR30, {}},
- {"trcrsctlr31", TRCRSCTLR31, {}},
- {"trcssccr0", TRCSSCCR0, {}},
- {"trcssccr1", TRCSSCCR1, {}},
- {"trcssccr2", TRCSSCCR2, {}},
- {"trcssccr3", TRCSSCCR3, {}},
- {"trcssccr4", TRCSSCCR4, {}},
- {"trcssccr5", TRCSSCCR5, {}},
- {"trcssccr6", TRCSSCCR6, {}},
- {"trcssccr7", TRCSSCCR7, {}},
- {"trcsscsr0", TRCSSCSR0, {}},
- {"trcsscsr1", TRCSSCSR1, {}},
- {"trcsscsr2", TRCSSCSR2, {}},
- {"trcsscsr3", TRCSSCSR3, {}},
- {"trcsscsr4", TRCSSCSR4, {}},
- {"trcsscsr5", TRCSSCSR5, {}},
- {"trcsscsr6", TRCSSCSR6, {}},
- {"trcsscsr7", TRCSSCSR7, {}},
- {"trcsspcicr0", TRCSSPCICR0, {}},
- {"trcsspcicr1", TRCSSPCICR1, {}},
- {"trcsspcicr2", TRCSSPCICR2, {}},
- {"trcsspcicr3", TRCSSPCICR3, {}},
- {"trcsspcicr4", TRCSSPCICR4, {}},
- {"trcsspcicr5", TRCSSPCICR5, {}},
- {"trcsspcicr6", TRCSSPCICR6, {}},
- {"trcsspcicr7", TRCSSPCICR7, {}},
- {"trcpdcr", TRCPDCR, {}},
- {"trcacvr0", TRCACVR0, {}},
- {"trcacvr1", TRCACVR1, {}},
- {"trcacvr2", TRCACVR2, {}},
- {"trcacvr3", TRCACVR3, {}},
- {"trcacvr4", TRCACVR4, {}},
- {"trcacvr5", TRCACVR5, {}},
- {"trcacvr6", TRCACVR6, {}},
- {"trcacvr7", TRCACVR7, {}},
- {"trcacvr8", TRCACVR8, {}},
- {"trcacvr9", TRCACVR9, {}},
- {"trcacvr10", TRCACVR10, {}},
- {"trcacvr11", TRCACVR11, {}},
- {"trcacvr12", TRCACVR12, {}},
- {"trcacvr13", TRCACVR13, {}},
- {"trcacvr14", TRCACVR14, {}},
- {"trcacvr15", TRCACVR15, {}},
- {"trcacatr0", TRCACATR0, {}},
- {"trcacatr1", TRCACATR1, {}},
- {"trcacatr2", TRCACATR2, {}},
- {"trcacatr3", TRCACATR3, {}},
- {"trcacatr4", TRCACATR4, {}},
- {"trcacatr5", TRCACATR5, {}},
- {"trcacatr6", TRCACATR6, {}},
- {"trcacatr7", TRCACATR7, {}},
- {"trcacatr8", TRCACATR8, {}},
- {"trcacatr9", TRCACATR9, {}},
- {"trcacatr10", TRCACATR10, {}},
- {"trcacatr11", TRCACATR11, {}},
- {"trcacatr12", TRCACATR12, {}},
- {"trcacatr13", TRCACATR13, {}},
- {"trcacatr14", TRCACATR14, {}},
- {"trcacatr15", TRCACATR15, {}},
- {"trcdvcvr0", TRCDVCVR0, {}},
- {"trcdvcvr1", TRCDVCVR1, {}},
- {"trcdvcvr2", TRCDVCVR2, {}},
- {"trcdvcvr3", TRCDVCVR3, {}},
- {"trcdvcvr4", TRCDVCVR4, {}},
- {"trcdvcvr5", TRCDVCVR5, {}},
- {"trcdvcvr6", TRCDVCVR6, {}},
- {"trcdvcvr7", TRCDVCVR7, {}},
- {"trcdvcmr0", TRCDVCMR0, {}},
- {"trcdvcmr1", TRCDVCMR1, {}},
- {"trcdvcmr2", TRCDVCMR2, {}},
- {"trcdvcmr3", TRCDVCMR3, {}},
- {"trcdvcmr4", TRCDVCMR4, {}},
- {"trcdvcmr5", TRCDVCMR5, {}},
- {"trcdvcmr6", TRCDVCMR6, {}},
- {"trcdvcmr7", TRCDVCMR7, {}},
- {"trccidcvr0", TRCCIDCVR0, {}},
- {"trccidcvr1", TRCCIDCVR1, {}},
- {"trccidcvr2", TRCCIDCVR2, {}},
- {"trccidcvr3", TRCCIDCVR3, {}},
- {"trccidcvr4", TRCCIDCVR4, {}},
- {"trccidcvr5", TRCCIDCVR5, {}},
- {"trccidcvr6", TRCCIDCVR6, {}},
- {"trccidcvr7", TRCCIDCVR7, {}},
- {"trcvmidcvr0", TRCVMIDCVR0, {}},
- {"trcvmidcvr1", TRCVMIDCVR1, {}},
- {"trcvmidcvr2", TRCVMIDCVR2, {}},
- {"trcvmidcvr3", TRCVMIDCVR3, {}},
- {"trcvmidcvr4", TRCVMIDCVR4, {}},
- {"trcvmidcvr5", TRCVMIDCVR5, {}},
- {"trcvmidcvr6", TRCVMIDCVR6, {}},
- {"trcvmidcvr7", TRCVMIDCVR7, {}},
- {"trccidcctlr0", TRCCIDCCTLR0, {}},
- {"trccidcctlr1", TRCCIDCCTLR1, {}},
- {"trcvmidcctlr0", TRCVMIDCCTLR0, {}},
- {"trcvmidcctlr1", TRCVMIDCCTLR1, {}},
- {"trcitctrl", TRCITCTRL, {}},
- {"trcclaimset", TRCCLAIMSET, {}},
- {"trcclaimclr", TRCCLAIMCLR, {}},
-
- // GICv3 registers
- {"icc_bpr1_el1", ICC_BPR1_EL1, {}},
- {"icc_bpr0_el1", ICC_BPR0_EL1, {}},
- {"icc_pmr_el1", ICC_PMR_EL1, {}},
- {"icc_ctlr_el1", ICC_CTLR_EL1, {}},
- {"icc_ctlr_el3", ICC_CTLR_EL3, {}},
- {"icc_sre_el1", ICC_SRE_EL1, {}},
- {"icc_sre_el2", ICC_SRE_EL2, {}},
- {"icc_sre_el3", ICC_SRE_EL3, {}},
- {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}},
- {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}},
- {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}},
- {"icc_seien_el1", ICC_SEIEN_EL1, {}},
- {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}},
- {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}},
- {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}},
- {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}},
- {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}},
- {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}},
- {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}},
- {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}},
- {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}},
- {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}},
- {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}},
- {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}},
- {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}},
- {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}},
- {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}},
- {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}},
- {"ich_hcr_el2", ICH_HCR_EL2, {}},
- {"ich_misr_el2", ICH_MISR_EL2, {}},
- {"ich_vmcr_el2", ICH_VMCR_EL2, {}},
- {"ich_vseir_el2", ICH_VSEIR_EL2, {}},
- {"ich_lr0_el2", ICH_LR0_EL2, {}},
- {"ich_lr1_el2", ICH_LR1_EL2, {}},
- {"ich_lr2_el2", ICH_LR2_EL2, {}},
- {"ich_lr3_el2", ICH_LR3_EL2, {}},
- {"ich_lr4_el2", ICH_LR4_EL2, {}},
- {"ich_lr5_el2", ICH_LR5_EL2, {}},
- {"ich_lr6_el2", ICH_LR6_EL2, {}},
- {"ich_lr7_el2", ICH_LR7_EL2, {}},
- {"ich_lr8_el2", ICH_LR8_EL2, {}},
- {"ich_lr9_el2", ICH_LR9_EL2, {}},
- {"ich_lr10_el2", ICH_LR10_EL2, {}},
- {"ich_lr11_el2", ICH_LR11_EL2, {}},
- {"ich_lr12_el2", ICH_LR12_EL2, {}},
- {"ich_lr13_el2", ICH_LR13_EL2, {}},
- {"ich_lr14_el2", ICH_LR14_EL2, {}},
- {"ich_lr15_el2", ICH_LR15_EL2, {}},
-
- // Cyclone registers
- {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}},
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- {"pan", PAN, {AArch64::HasV8_1aOps}},
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}},
- {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}},
- {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}},
- {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}},
-
- // v8.1a "Virtualization host extensions" system registers
- {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}},
- {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}},
- {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}},
- {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}},
- {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}},
- {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}},
- {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}},
- {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}},
- {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}},
- {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}},
- {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}},
- {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}},
- {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}},
- {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}},
- {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}},
- {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}},
- {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}},
- {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}},
- {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}},
- {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}},
-
- // v8.2a registers
- {"uao", UAO, {AArch64::HasV8_2aOps}},
-
- // v8.2a "Statistical Profiling extension" registers
- {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}},
- {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}},
- {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}},
- {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}},
- {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}},
- {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}},
- {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}},
- {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}},
- {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}},
- {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}},
- {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}},
- {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}},
- {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}},
-};
-
-uint32_t
-AArch64SysReg::SysRegMapper::fromString(StringRef Name,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- std::string NameLower = Name.lower();
-
- // First search the registers shared by all
- for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
- if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) {
- Valid = true;
- return SysRegMappings[i].Value;
- }
+namespace llvm {
+ namespace AArch64PSBHint {
+#define GET_PSB_IMPL
+#include "AArch64GenSystemOperands.inc"
}
+}
- // Now try the instruction-specific registers (either read-only or
- // write-only).
- for (unsigned i = 0; i < NumInstMappings; ++i) {
- if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) {
- Valid = true;
- return InstMappings[i].Value;
- }
+namespace llvm {
+ namespace AArch64SysReg {
+#define GET_SYSREG_IMPL
+#include "AArch64GenSystemOperands.inc"
}
+}
+uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
   // Try to parse an S<op0>_<op1>_C<Cn>_C<Cm>_<op2> register name
- Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$");
+ Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+ std::string UpperName = Name.upper();
SmallVector<StringRef, 5> Ops;
- if (!GenericRegPattern.match(NameLower, &Ops)) {
- Valid = false;
+ if (!GenericRegPattern.match(UpperName, &Ops))
return -1;
- }
uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
uint32_t Bits;
@@ -873,28 +99,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name,
Ops[5].getAsInteger(10, Op2);
Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
- Valid = true;
return Bits;
}
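
For concreteness, a short illustration of the packing above; this is not part of the patch, and the include path is an assumption. The generic name S3_0_C4_C2_0 decomposes into Op0=3, Op1=0, CRn=4, CRm=2, Op2=0:

    // Illustration: exercising the new generic-register parser.
    #include "Utils/AArch64BaseInfo.h" // assumed include path
    #include <cassert>
    using namespace llvm;

    static void checkParseGenericRegister() {
      // "S3_0_C4_C2_0": Op0=3, Op1=0, CRn=4, CRm=2, Op2=0, so
      // Bits = (3 << 14) | (0 << 11) | (4 << 7) | (2 << 3) | 0 = 0xc210,
      // the encoding the old SysRegValues table below lists for SPSel.
      assert(AArch64SysReg::parseGenericRegister("S3_0_C4_C2_0") == 0xc210);
      // The name is upper-cased before the regex runs, so the lower-case
      // spelling the old code matched still parses.
      assert(AArch64SysReg::parseGenericRegister("s3_0_c4_c2_0") == 0xc210);
      // Named registers and malformed strings fail with -1.
      assert(AArch64SysReg::parseGenericRegister("spsel") == uint32_t(-1));
    }
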
-std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
- const FeatureBitset& FeatureBits) const {
- // First search the registers shared by all
- for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
- if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) {
- return SysRegMappings[i].Name;
- }
- }
-
- // Now try the instruction-specific registers (either read-only or
- // write-only).
- for (unsigned i = 0; i < NumInstMappings; ++i) {
- if (InstMappings[i].isValueEqual(Bits, FeatureBits)) {
- return InstMappings[i].Name;
- }
- }
-
+std::string AArch64SysReg::genericRegisterString(uint32_t Bits) {
assert(Bits < 0x10000);
uint32_t Op0 = (Bits >> 14) & 0x3;
uint32_t Op1 = (Bits >> 11) & 0x7;
@@ -902,44 +110,13 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
uint32_t CRm = (Bits >> 3) & 0xf;
uint32_t Op2 = Bits & 0x7;
- return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn)
- + "_c" + utostr(CRm) + "_" + utostr(Op2);
+ return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" +
+ utostr(CRm) + "_" + utostr(Op2);
}
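
genericRegisterString is the exact inverse of parseGenericRegister, with the output switched from the old lower-case "s..._c..." form to the upper-case form the parser now accepts. A round-trip check, continuing the illustration above:

    // Continuing the illustration: the two helpers invert each other,
    // modulo the case change (the old toString printed "s3_0_c4_c2_0").
    static void checkGenericRegisterString() {
      assert(AArch64SysReg::genericRegisterString(0xc210) == "S3_0_C4_C2_0");
      assert(AArch64SysReg::parseGenericRegister(
                 AArch64SysReg::genericRegisterString(0xc210)) == 0xc210);
    }
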
-const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = {
- {"ipas2e1is", IPAS2E1IS, {}},
- {"ipas2le1is", IPAS2LE1IS, {}},
- {"vmalle1is", VMALLE1IS, {}},
- {"alle2is", ALLE2IS, {}},
- {"alle3is", ALLE3IS, {}},
- {"vae1is", VAE1IS, {}},
- {"vae2is", VAE2IS, {}},
- {"vae3is", VAE3IS, {}},
- {"aside1is", ASIDE1IS, {}},
- {"vaae1is", VAAE1IS, {}},
- {"alle1is", ALLE1IS, {}},
- {"vale1is", VALE1IS, {}},
- {"vale2is", VALE2IS, {}},
- {"vale3is", VALE3IS, {}},
- {"vmalls12e1is", VMALLS12E1IS, {}},
- {"vaale1is", VAALE1IS, {}},
- {"ipas2e1", IPAS2E1, {}},
- {"ipas2le1", IPAS2LE1, {}},
- {"vmalle1", VMALLE1, {}},
- {"alle2", ALLE2, {}},
- {"alle3", ALLE3, {}},
- {"vae1", VAE1, {}},
- {"vae2", VAE2, {}},
- {"vae3", VAE3, {}},
- {"aside1", ASIDE1, {}},
- {"vaae1", VAAE1, {}},
- {"alle1", ALLE1, {}},
- {"vale1", VALE1, {}},
- {"vale2", VALE2, {}},
- {"vale3", VALE3, {}},
- {"vmalls12e1", VMALLS12E1, {}},
- {"vaale1", VAALE1, {}}
-};
-
-AArch64TLBI::TLBIMapper::TLBIMapper()
- : AArch64NamedImmMapper(TLBIMappings, 0) {}
+namespace llvm {
+ namespace AArch64TLBI {
+#define GET_TLBI_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e63627eae123b..dcc39176031c5 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,231 +266,85 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
}
} // end namespace AArch64CC
-/// Instances of this class can perform bidirectional mapping from random
-/// identifier strings to operand encodings. For example "MSR" takes a named
-/// system-register which must be encoded somehow and decoded for printing. This
-/// central location means that the information for those transformations is not
-/// duplicated and remains in sync.
-///
-/// FIXME: currently the algorithm is a completely unoptimised linear
-/// search. Obviously this could be improved, but we would probably want to work
-/// out just how often these instructions are emitted before working on it. It
-/// might even be optimal to just reorder the tables for the common instructions
-/// rather than changing the algorithm.
-struct AArch64NamedImmMapper {
- struct Mapping {
+namespace AArch64AT {
+ struct AT {
const char *Name;
- uint32_t Value;
- // Set of features this mapping is available for
- // Zero value of FeatureBitSet means the mapping is always available
- FeatureBitset FeatureBitSet;
-
- bool isNameEqual(std::string Other,
- const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
- (FeatureBitSet & FeatureBits).none())
- return false;
- return Name == Other;
- }
-
- bool isValueEqual(uint32_t Other,
- const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
- (FeatureBitSet & FeatureBits).none())
- return false;
- return Value == Other;
- }
- };
-
- template<int N>
- AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm)
- : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {}
-
- // Maps value to string, depending on availability for FeatureBits given
- StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits,
- bool &Valid) const;
- // Maps string to value, depending on availability for FeatureBits given
- uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
- bool &Valid) const;
-
- /// Many of the instructions allow an alternative assembly form consisting of
- /// a simple immediate. Currently the only valid forms are ranges [0, N) where
- /// N being 0 indicates no immediate syntax-form is allowed.
- bool validImm(uint32_t Value) const;
-protected:
- const Mapping *Mappings;
- size_t NumMappings;
- uint32_t TooBigImm;
-};
-
-namespace AArch64AT {
- enum ATValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- S1E1R = 0x43c0, // 01 000 0111 1000 000
- S1E2R = 0x63c0, // 01 100 0111 1000 000
- S1E3R = 0x73c0, // 01 110 0111 1000 000
- S1E1W = 0x43c1, // 01 000 0111 1000 001
- S1E2W = 0x63c1, // 01 100 0111 1000 001
- S1E3W = 0x73c1, // 01 110 0111 1000 001
- S1E0R = 0x43c2, // 01 000 0111 1000 010
- S1E0W = 0x43c3, // 01 000 0111 1000 011
- S12E1R = 0x63c4, // 01 100 0111 1000 100
- S12E1W = 0x63c5, // 01 100 0111 1000 101
- S12E0R = 0x63c6, // 01 100 0111 1000 110
- S12E0W = 0x63c7, // 01 100 0111 1000 111
- S1E1RP = 0x43c8, // 01 000 0111 1001 000
- S1E1WP = 0x43c9 // 01 000 0111 1001 001
+ uint16_t Encoding;
};
- struct ATMapper : AArch64NamedImmMapper {
- const static Mapping ATMappings[];
-
- ATMapper();
- };
+ #define GET_AT_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64DB {
- enum DBValues {
- Invalid = -1,
- OSHLD = 0x1,
- OSHST = 0x2,
- OSH = 0x3,
- NSHLD = 0x5,
- NSHST = 0x6,
- NSH = 0x7,
- ISHLD = 0x9,
- ISHST = 0xa,
- ISH = 0xb,
- LD = 0xd,
- ST = 0xe,
- SY = 0xf
+ struct DB {
+ const char *Name;
+ uint16_t Encoding;
};
- struct DBarrierMapper : AArch64NamedImmMapper {
- const static Mapping DBarrierMappings[];
-
- DBarrierMapper();
- };
+ #define GET_DB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64DC {
- enum DCValues {
- Invalid = -1, // Op1 CRn CRm Op2
- ZVA = 0x5ba1, // 01 011 0111 0100 001
- IVAC = 0x43b1, // 01 000 0111 0110 001
- ISW = 0x43b2, // 01 000 0111 0110 010
- CVAC = 0x5bd1, // 01 011 0111 1010 001
- CSW = 0x43d2, // 01 000 0111 1010 010
- CVAU = 0x5bd9, // 01 011 0111 1011 001
- CIVAC = 0x5bf1, // 01 011 0111 1110 001
- CISW = 0x43f2 // 01 000 0111 1110 010
- };
-
- struct DCMapper : AArch64NamedImmMapper {
- const static Mapping DCMappings[];
-
- DCMapper();
+ struct DC {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_DC_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64IC {
- enum ICValues {
- Invalid = -1, // Op1 CRn CRm Op2
- IALLUIS = 0x0388, // 000 0111 0001 000
- IALLU = 0x03a8, // 000 0111 0101 000
- IVAU = 0x1ba9 // 011 0111 0101 001
- };
-
-
- struct ICMapper : AArch64NamedImmMapper {
- const static Mapping ICMappings[];
-
- ICMapper();
+ struct IC {
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg;
};
-
- static inline bool NeedsRegister(ICValues Val) {
- return Val == IVAU;
- }
+ #define GET_IC_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64ISB {
- enum ISBValues {
- Invalid = -1,
- SY = 0xf
- };
- struct ISBMapper : AArch64NamedImmMapper {
- const static Mapping ISBMappings[];
-
- ISBMapper();
+ struct ISB {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_ISB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64PRFM {
- enum PRFMValues {
- Invalid = -1,
- PLDL1KEEP = 0x00,
- PLDL1STRM = 0x01,
- PLDL2KEEP = 0x02,
- PLDL2STRM = 0x03,
- PLDL3KEEP = 0x04,
- PLDL3STRM = 0x05,
- PLIL1KEEP = 0x08,
- PLIL1STRM = 0x09,
- PLIL2KEEP = 0x0a,
- PLIL2STRM = 0x0b,
- PLIL3KEEP = 0x0c,
- PLIL3STRM = 0x0d,
- PSTL1KEEP = 0x10,
- PSTL1STRM = 0x11,
- PSTL2KEEP = 0x12,
- PSTL2STRM = 0x13,
- PSTL3KEEP = 0x14,
- PSTL3STRM = 0x15
- };
-
- struct PRFMMapper : AArch64NamedImmMapper {
- const static Mapping PRFMMappings[];
-
- PRFMMapper();
+ struct PRFM {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_PRFM_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64PState {
- enum PStateValues {
- Invalid = -1,
- SPSel = 0x05,
- DAIFSet = 0x1e,
- DAIFClr = 0x1f,
-
- // v8.1a "Privileged Access Never" extension-specific PStates
- PAN = 0x04,
-
- // v8.2a "User Access Override" extension-specific PStates
- UAO = 0x03
- };
-
- struct PStateMapper : AArch64NamedImmMapper {
- const static Mapping PStateMappings[];
+ struct PState {
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
- PStateMapper();
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
};
-
+ #define GET_PSTATE_DECL
+ #include "AArch64GenSystemOperands.inc"
}
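
The haveFeatures predicate is the table-driven replacement for the per-mapping FeatureBitSet test the old isNameEqual/isValueEqual performed: a record is usable only when every one of its required feature bits is active. A minimal sketch, assuming the searchable-table backend emits lookupPStateByName by analogy with the lookupSysRegByName declared below; the lookup name, its case handling, and the include path are assumptions:

    // Sketch: feature-gating a PState name at parse time.
    #include "Utils/AArch64BaseInfo.h" // assumed include path
    using namespace llvm;

    static bool pStateUsable(StringRef Name, const FeatureBitset &Active) {
      // lookupPStateByName is assumed to be generated alongside the
      // GET_PSTATE_DECL declarations pulled in above.
      if (const AArch64PState::PState *PS =
              AArch64PState::lookupPStateByName(Name))
        // True only when FeaturesRequired is a subset of the active bits;
        // e.g. "pan" carries AArch64::HasV8_1aOps in the .cpp table above.
        return PS->haveFeatures(Active);
      return false;
    }
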
namespace AArch64PSBHint {
- enum PSBHintValues {
- Invalid = -1,
- // v8.2a "Statistical Profiling" extension-specific PSB operands
- CSync = 0x11, // psb csync = hint #0x11
- };
-
- struct PSBHintMapper : AArch64NamedImmMapper {
- const static Mapping PSBHintMappings[];
-
- PSBHintMapper();
+ struct PSB {
+ const char *Name;
+ uint16_t Encoding;
};
-
+ #define GET_PSB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64SE {
@@ -574,754 +428,36 @@ AArch64StringToVectorLayout(StringRef LayoutStr) {
}
namespace AArch64SysReg {
- enum SysRegROValues {
- MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
- DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000
- MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000
- OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100
- DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110
- PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110
- PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111
- MIDR_EL1 = 0xc000, // 11 000 0000 0000 000
- CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000
- CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001
- CTR_EL0 = 0xd801, // 11 011 0000 0000 001
- MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101
- REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110
- AIDR_EL1 = 0xc807, // 11 001 0000 0000 111
- DCZID_EL0 = 0xd807, // 11 011 0000 0000 111
- ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000
- ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001
- ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010
- ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011
- ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100
- ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101
- ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110
- ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111
- ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000
- ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001
- ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010
- ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011
- ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100
- ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101
- ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000
- ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001
- ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000
- ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001
- ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100
- ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101
- ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000
- ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
- ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
- ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
- ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010
- MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000
- MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001
- MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010
- RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001
- RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001
- RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001
- ISR_EL1 = 0xc608, // 11 000 1100 0001 000
- CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001
- CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010
- ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110
-
- // Trace registers
- TRCSTATR = 0x8818, // 10 001 0000 0011 000
- TRCIDR8 = 0x8806, // 10 001 0000 0000 110
- TRCIDR9 = 0x880e, // 10 001 0000 0001 110
- TRCIDR10 = 0x8816, // 10 001 0000 0010 110
- TRCIDR11 = 0x881e, // 10 001 0000 0011 110
- TRCIDR12 = 0x8826, // 10 001 0000 0100 110
- TRCIDR13 = 0x882e, // 10 001 0000 0101 110
- TRCIDR0 = 0x8847, // 10 001 0000 1000 111
- TRCIDR1 = 0x884f, // 10 001 0000 1001 111
- TRCIDR2 = 0x8857, // 10 001 0000 1010 111
- TRCIDR3 = 0x885f, // 10 001 0000 1011 111
- TRCIDR4 = 0x8867, // 10 001 0000 1100 111
- TRCIDR5 = 0x886f, // 10 001 0000 1101 111
- TRCIDR6 = 0x8877, // 10 001 0000 1110 111
- TRCIDR7 = 0x887f, // 10 001 0000 1111 111
- TRCOSLSR = 0x888c, // 10 001 0001 0001 100
- TRCPDSR = 0x88ac, // 10 001 0001 0101 100
- TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110
- TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110
- TRCLSR = 0x8bee, // 10 001 0111 1101 110
- TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110
- TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110
- TRCDEVID = 0x8b97, // 10 001 0111 0010 111
- TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111
- TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111
- TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111
- TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111
- TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111
- TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111
- TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111
- TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111
- TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111
- TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111
- TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111
- TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111
- TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111
-
- // GICv3 registers
- ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000
- ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000
- ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010
- ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010
- ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011
- ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001
- ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011
- ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101
- };
-
- enum SysRegWOValues {
- DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000
- OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100
- PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100
-
- // Trace Registers
- TRCOSLAR = 0x8884, // 10 001 0001 0000 100
- TRCLAR = 0x8be6, // 10 001 0111 1100 110
-
- // GICv3 registers
- ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001
- ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001
- ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001
- ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101
- ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110
- ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111
- };
-
- enum SysRegValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010
- OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010
- TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000
- MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000
- MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010
- DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000
- OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010
- DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000
- DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100
- DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100
- DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100
- DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100
- DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100
- DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100
- DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100
- DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100
- DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100
- DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100
- DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100
- DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100
- DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100
- DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100
- DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100
- DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100
- DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101
- DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101
- DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101
- DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101
- DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101
- DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101
- DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101
- DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101
- DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101
- DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101
- DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101
- DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101
- DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101
- DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101
- DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101
- DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101
- DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110
- DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110
- DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110
- DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110
- DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110
- DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110
- DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110
- DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110
- DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110
- DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110
- DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110
- DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110
- DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110
- DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110
- DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110
- DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110
- DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111
- DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111
- DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111
- DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111
- DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111
- DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111
- DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111
- DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111
- DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111
- DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111
- DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111
- DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111
- DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111
- DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111
- DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111
- DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111
- TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000
- OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100
- DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100
- DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110
- DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110
- CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000
- VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000
- VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101
- CPACR_EL1 = 0xc082, // 11 000 0001 0000 010
- SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000
- SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000
- SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000
- ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001
- ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001
- ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001
- HCR_EL2 = 0xe088, // 11 100 0001 0001 000
- SCR_EL3 = 0xf088, // 11 110 0001 0001 000
- MDCR_EL2 = 0xe089, // 11 100 0001 0001 001
- SDER32_EL3 = 0xf089, // 11 110 0001 0001 001
- CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010
- CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010
- HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011
- HACR_EL2 = 0xe08f, // 11 100 0001 0001 111
- MDCR_EL3 = 0xf099, // 11 110 0001 0011 001
- TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000
- TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000
- TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000
- TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001
- TCR_EL1 = 0xc102, // 11 000 0010 0000 010
- TCR_EL2 = 0xe102, // 11 100 0010 0000 010
- TCR_EL3 = 0xf102, // 11 110 0010 0000 010
- VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000
- VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010
- DACR32_EL2 = 0xe180, // 11 100 0011 0000 000
- SPSR_EL1 = 0xc200, // 11 000 0100 0000 000
- SPSR_EL2 = 0xe200, // 11 100 0100 0000 000
- SPSR_EL3 = 0xf200, // 11 110 0100 0000 000
- ELR_EL1 = 0xc201, // 11 000 0100 0000 001
- ELR_EL2 = 0xe201, // 11 100 0100 0000 001
- ELR_EL3 = 0xf201, // 11 110 0100 0000 001
- SP_EL0 = 0xc208, // 11 000 0100 0001 000
- SP_EL1 = 0xe208, // 11 100 0100 0001 000
- SP_EL2 = 0xf208, // 11 110 0100 0001 000
- SPSel = 0xc210, // 11 000 0100 0010 000
- NZCV = 0xda10, // 11 011 0100 0010 000
- DAIF = 0xda11, // 11 011 0100 0010 001
- CurrentEL = 0xc212, // 11 000 0100 0010 010
- SPSR_irq = 0xe218, // 11 100 0100 0011 000
- SPSR_abt = 0xe219, // 11 100 0100 0011 001
- SPSR_und = 0xe21a, // 11 100 0100 0011 010
- SPSR_fiq = 0xe21b, // 11 100 0100 0011 011
- FPCR = 0xda20, // 11 011 0100 0100 000
- FPSR = 0xda21, // 11 011 0100 0100 001
- DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000
- DLR_EL0 = 0xda29, // 11 011 0100 0101 001
- IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001
- AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000
- AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000
- AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000
- AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001
- AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001
- AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001
- ESR_EL1 = 0xc290, // 11 000 0101 0010 000
- ESR_EL2 = 0xe290, // 11 100 0101 0010 000
- ESR_EL3 = 0xf290, // 11 110 0101 0010 000
- FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000
- FAR_EL1 = 0xc300, // 11 000 0110 0000 000
- FAR_EL2 = 0xe300, // 11 100 0110 0000 000
- FAR_EL3 = 0xf300, // 11 110 0110 0000 000
- HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100
- PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000
- PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000
- PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001
- PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010
- PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011
- PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101
- PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000
- PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001
- PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010
- PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000
- PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001
- PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010
- PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011
- MAIR_EL1 = 0xc510, // 11 000 1010 0010 000
- MAIR_EL2 = 0xe510, // 11 100 1010 0010 000
- MAIR_EL3 = 0xf510, // 11 110 1010 0010 000
- AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000
- AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000
- AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000
- VBAR_EL1 = 0xc600, // 11 000 1100 0000 000
- VBAR_EL2 = 0xe600, // 11 100 1100 0000 000
- VBAR_EL3 = 0xf600, // 11 110 1100 0000 000
- RMR_EL1 = 0xc602, // 11 000 1100 0000 010
- RMR_EL2 = 0xe602, // 11 100 1100 0000 010
- RMR_EL3 = 0xf602, // 11 110 1100 0000 010
- CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001
- TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010
- TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010
- TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010
- TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011
- TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100
- CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000
- CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011
- CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000
- CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000
- CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000
- CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000
- CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000
- CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001
- CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001
- CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001
- CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010
- CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010
- CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010
- CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000
- CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001
- CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010
- PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000
- PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001
- PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010
- PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011
- PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100
- PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101
- PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110
- PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111
- PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000
- PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001
- PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010
- PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011
- PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100
- PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101
- PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110
- PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111
- PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000
- PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001
- PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010
- PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011
- PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100
- PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101
- PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110
- PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111
- PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000
- PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001
- PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010
- PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011
- PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100
- PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101
- PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110
- PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111
- PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000
- PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001
- PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010
- PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011
- PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100
- PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101
- PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110
- PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111
- PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000
- PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001
- PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010
- PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011
- PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100
- PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101
- PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110
- PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111
- PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000
- PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001
- PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010
- PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011
- PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100
- PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101
- PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110
- PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111
- PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000
- PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001
- PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010
- PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011
- PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100
- PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101
- PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110
-
- // Trace registers
- TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000
- TRCPROCSELR = 0x8810, // 10 001 0000 0010 000
- TRCCONFIGR = 0x8820, // 10 001 0000 0100 000
- TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000
- TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000
- TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000
- TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000
- TRCTSCTLR = 0x8860, // 10 001 0000 1100 000
- TRCSYNCPR = 0x8868, // 10 001 0000 1101 000
- TRCCCCTLR = 0x8870, // 10 001 0000 1110 000
- TRCBBCTLR = 0x8878, // 10 001 0000 1111 000
- TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001
- TRCQCTLR = 0x8809, // 10 001 0000 0001 001
- TRCVICTLR = 0x8802, // 10 001 0000 0000 010
- TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010
- TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010
- TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010
- TRCVDCTLR = 0x8842, // 10 001 0000 1000 010
- TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010
- TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010
- TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100
- TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100
- TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100
- TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100
- TRCSEQSTR = 0x883c, // 10 001 0000 0111 100
- TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100
- TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101
- TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101
- TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101
- TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101
- TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101
- TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101
- TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101
- TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101
- TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101
- TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101
- TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101
- TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101
- TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111
- TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111
- TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111
- TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111
- TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111
- TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111
- TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111
- TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111
- TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000
- TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000
- TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000
- TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000
- TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000
- TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000
- TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000
- TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000
- TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000
- TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000
- TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000
- TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000
- TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000
- TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000
- TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001
- TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001
- TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001
- TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001
- TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001
- TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001
- TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001
- TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001
- TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001
- TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001
- TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001
- TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001
- TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001
- TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001
- TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001
- TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001
- TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010
- TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010
- TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010
- TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010
- TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010
- TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010
- TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010
- TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010
- TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010
- TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010
- TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010
- TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010
- TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010
- TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010
- TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010
- TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010
- TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011
- TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011
- TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011
- TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011
- TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011
- TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011
- TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011
- TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011
- TRCPDCR = 0x88a4, // 10 001 0001 0100 100
- TRCACVR0 = 0x8900, // 10 001 0010 0000 000
- TRCACVR1 = 0x8910, // 10 001 0010 0010 000
- TRCACVR2 = 0x8920, // 10 001 0010 0100 000
- TRCACVR3 = 0x8930, // 10 001 0010 0110 000
- TRCACVR4 = 0x8940, // 10 001 0010 1000 000
- TRCACVR5 = 0x8950, // 10 001 0010 1010 000
- TRCACVR6 = 0x8960, // 10 001 0010 1100 000
- TRCACVR7 = 0x8970, // 10 001 0010 1110 000
- TRCACVR8 = 0x8901, // 10 001 0010 0000 001
- TRCACVR9 = 0x8911, // 10 001 0010 0010 001
- TRCACVR10 = 0x8921, // 10 001 0010 0100 001
- TRCACVR11 = 0x8931, // 10 001 0010 0110 001
- TRCACVR12 = 0x8941, // 10 001 0010 1000 001
- TRCACVR13 = 0x8951, // 10 001 0010 1010 001
- TRCACVR14 = 0x8961, // 10 001 0010 1100 001
- TRCACVR15 = 0x8971, // 10 001 0010 1110 001
- TRCACATR0 = 0x8902, // 10 001 0010 0000 010
- TRCACATR1 = 0x8912, // 10 001 0010 0010 010
- TRCACATR2 = 0x8922, // 10 001 0010 0100 010
- TRCACATR3 = 0x8932, // 10 001 0010 0110 010
- TRCACATR4 = 0x8942, // 10 001 0010 1000 010
- TRCACATR5 = 0x8952, // 10 001 0010 1010 010
- TRCACATR6 = 0x8962, // 10 001 0010 1100 010
- TRCACATR7 = 0x8972, // 10 001 0010 1110 010
- TRCACATR8 = 0x8903, // 10 001 0010 0000 011
- TRCACATR9 = 0x8913, // 10 001 0010 0010 011
- TRCACATR10 = 0x8923, // 10 001 0010 0100 011
- TRCACATR11 = 0x8933, // 10 001 0010 0110 011
- TRCACATR12 = 0x8943, // 10 001 0010 1000 011
- TRCACATR13 = 0x8953, // 10 001 0010 1010 011
- TRCACATR14 = 0x8963, // 10 001 0010 1100 011
- TRCACATR15 = 0x8973, // 10 001 0010 1110 011
- TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100
- TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100
- TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100
- TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100
- TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101
- TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101
- TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101
- TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101
- TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110
- TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110
- TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110
- TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110
- TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111
- TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111
- TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111
- TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111
- TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000
- TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000
- TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000
- TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000
- TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000
- TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000
- TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000
- TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000
- TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001
- TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001
- TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001
- TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001
- TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001
- TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001
- TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001
- TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001
- TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010
- TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010
- TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010
- TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010
- TRCITCTRL = 0x8b84, // 10 001 0111 0000 100
- TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110
- TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110
-
- // GICv3 registers
- ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011
- ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011
- ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000
- ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100
- ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100
- ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101
- ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101
- ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101
- ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110
- ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111
- ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111
- ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000
- ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100
- ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101
- ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110
- ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111
- ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000
- ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001
- ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010
- ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011
- ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000
- ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001
- ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010
- ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011
- ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000
- ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001
- ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010
- ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011
- ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000
- ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010
- ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111
- ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100
- ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000
- ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001
- ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010
- ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011
- ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100
- ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101
- ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110
- ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111
- ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000
- ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001
- ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010
- ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011
- ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100
- ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101
- ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110
- ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- PAN = 0xc213, // 11 000 0100 0010 011
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- LORSA_EL1 = 0xc520, // 11 000 1010 0100 000
- LOREA_EL1 = 0xc521, // 11 000 1010 0100 001
- LORN_EL1 = 0xc522, // 11 000 1010 0100 010
- LORC_EL1 = 0xc523, // 11 000 1010 0100 011
- LORID_EL1 = 0xc527, // 11 000 1010 0100 111
-
- // v8.1a "Virtualization host extensions" system registers
- TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001
- CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001
- CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000
- CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010
- CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001
- SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000
- CPACR_EL12 = 0xe882, // 11 101 0001 0000 010
- TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000
- TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001
- TCR_EL12 = 0xe902, // 11 101 0010 0000 010
- AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000
- AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001
- ESR_EL12 = 0xea90, // 11 101 0101 0010 000
- FAR_EL12 = 0xeb00, // 11 101 0110 0000 000
- MAIR_EL12 = 0xed10, // 11 101 1010 0010 000
- AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000
- VBAR_EL12 = 0xee00, // 11 101 1100 0000 000
- CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001
- CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000
- CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000
- CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001
- CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010
- CNTV_TVAL_EL02 = 0xef18, // 11 101 1110 0011 000
- CNTV_CTL_EL02 = 0xef19, // 11 101 1110 0011 001
- CNTV_CVAL_EL02 = 0xef1a, // 11 101 1110 0011 010
- SPSR_EL12 = 0xea00, // 11 101 0100 0000 000
- ELR_EL12 = 0xea01, // 11 101 0100 0000 001
-
- // v8.2a registers
- UAO = 0xc214, // 11 000 0100 0010 100
-
- // v8.2a "Statistical Profiling extension" registers
- PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000
- PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001
- PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011
- PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111
- PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000
- PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000
- PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000
- PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010
- PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011
- PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100
- PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101
- PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110
- PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111
+ struct SysReg {
+ const char *Name;
+ unsigned Encoding;
+ bool Readable;
+ bool Writeable;
+ FeatureBitset FeaturesRequired;
- // Cyclone specific system registers
- CPM_IOACC_CTL_EL3 = 0xff90,
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
};
- // Note that these do not inherit from AArch64NamedImmMapper. This class is
- // sufficiently different in its behaviour that I don't believe it's worth
- // burdening the common AArch64NamedImmMapper with abstractions only needed in
- // this one case.
- struct SysRegMapper {
- static const AArch64NamedImmMapper::Mapping SysRegMappings[];
+ #define GET_SYSREG_DECL
+ #include "AArch64GenSystemOperands.inc"
- const AArch64NamedImmMapper::Mapping *InstMappings;
- size_t NumInstMappings;
+ const SysReg *lookupSysRegByName(StringRef);
+ const SysReg *lookupSysRegByEncoding(uint16_t);
- SysRegMapper() { }
- uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
- bool &Valid) const;
- std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const;
- };
-
- struct MSRMapper : SysRegMapper {
- static const AArch64NamedImmMapper::Mapping MSRMappings[];
- MSRMapper();
- };
-
- struct MRSMapper : SysRegMapper {
- static const AArch64NamedImmMapper::Mapping MRSMappings[];
- MRSMapper();
- };
-
- uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+ uint32_t parseGenericRegister(StringRef Name);
+ std::string genericRegisterString(uint32_t Bits);
}
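
Taken together, the generated SysReg table and the generic parser cover both ways an MRS/MSR operand can be written. A minimal consumer sketch, not the actual AsmParser code from this patch; the include path and the case handling of Name against the table entries are assumptions, while lookupSysRegByName, the SysReg fields, and parseGenericRegister are as declared here:

    // Sketch: resolving an MRS source-operand name to an encoding.
    #include "Utils/AArch64BaseInfo.h" // assumed include path
    using namespace llvm;

    static uint32_t resolveMRSOperand(StringRef Name,
                                      const FeatureBitset &ActiveFeatures,
                                      bool &Valid) {
      // One generated table replaces the old SysRegMappings scan plus the
      // MRS/MSR-specific InstMappings scans; readability and feature
      // gating now travel with each record.
      if (const AArch64SysReg::SysReg *Reg =
              AArch64SysReg::lookupSysRegByName(Name)) {
        Valid = Reg->Readable && Reg->haveFeatures(ActiveFeatures);
        return Reg->Encoding;
      }
      // Fall back to the generic S<op0>_<op1>_C<Cn>_C<Cm>_<op2> spelling
      // handled by parseGenericRegister in AArch64BaseInfo.cpp.
      uint32_t Bits = AArch64SysReg::parseGenericRegister(Name);
      Valid = Bits != uint32_t(-1);
      return Bits;
    }
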
namespace AArch64TLBI {
- enum TLBIValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- IPAS2E1IS = 0x6401, // 01 100 1000 0000 001
- IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101
- VMALLE1IS = 0x4418, // 01 000 1000 0011 000
- ALLE2IS = 0x6418, // 01 100 1000 0011 000
- ALLE3IS = 0x7418, // 01 110 1000 0011 000
- VAE1IS = 0x4419, // 01 000 1000 0011 001
- VAE2IS = 0x6419, // 01 100 1000 0011 001
- VAE3IS = 0x7419, // 01 110 1000 0011 001
- ASIDE1IS = 0x441a, // 01 000 1000 0011 010
- VAAE1IS = 0x441b, // 01 000 1000 0011 011
- ALLE1IS = 0x641c, // 01 100 1000 0011 100
- VALE1IS = 0x441d, // 01 000 1000 0011 101
- VALE2IS = 0x641d, // 01 100 1000 0011 101
- VALE3IS = 0x741d, // 01 110 1000 0011 101
- VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110
- VAALE1IS = 0x441f, // 01 000 1000 0011 111
- IPAS2E1 = 0x6421, // 01 100 1000 0100 001
- IPAS2LE1 = 0x6425, // 01 100 1000 0100 101
- VMALLE1 = 0x4438, // 01 000 1000 0111 000
- ALLE2 = 0x6438, // 01 100 1000 0111 000
- ALLE3 = 0x7438, // 01 110 1000 0111 000
- VAE1 = 0x4439, // 01 000 1000 0111 001
- VAE2 = 0x6439, // 01 100 1000 0111 001
- VAE3 = 0x7439, // 01 110 1000 0111 001
- ASIDE1 = 0x443a, // 01 000 1000 0111 010
- VAAE1 = 0x443b, // 01 000 1000 0111 011
- ALLE1 = 0x643c, // 01 100 1000 0111 100
- VALE1 = 0x443d, // 01 000 1000 0111 101
- VALE2 = 0x643d, // 01 100 1000 0111 101
- VALE3 = 0x743d, // 01 110 1000 0111 101
- VMALLS12E1 = 0x643e, // 01 100 1000 0111 110
- VAALE1 = 0x443f // 01 000 1000 0111 111
- };
-
- struct TLBIMapper : AArch64NamedImmMapper {
- const static Mapping TLBIMappings[];
-
- TLBIMapper();
+ struct TLBI {
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg;
};
-
- static inline bool NeedsRegister(TLBIValues Val) {
- switch (Val) {
- case VMALLE1IS:
- case ALLE2IS:
- case ALLE3IS:
- case ALLE1IS:
- case VMALLS12E1IS:
- case VMALLE1:
- case ALLE2:
- case ALLE3:
- case ALLE1:
- case VMALLS12E1:
- return false;
- default:
- return true;
- }
- }
+ #define GET_TLBI_DECL
+ #include "AArch64GenSystemOperands.inc"
}
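
The per-record NeedsReg flag replaces the old static NeedsRegister switch: rather than enumerating the ten all-*/vmall* operations that take no register, each entry states directly whether an Xt operand follows (AArch64IC above folds its NeedsRegister, true only for IVAU, into a field the same way). A sketch, assuming lookupTLBIByName is generated like the lookupSysRegByName declared above:

    // Sketch: deciding whether a "tlbi <op>" needs a trailing Xt operand.
    static bool tlbiNeedsRegister(StringRef Name) {
      // lookupTLBIByName is an assumed generated helper.
      if (const AArch64TLBI::TLBI *Op = AArch64TLBI::lookupTLBIByName(Name))
        // e.g. "tlbi vae1, x0" takes a register; "tlbi vmalle1" and the
        // other all-*/vmall* forms the old switch listed do not.
        return Op->NeedsReg;
      return false;
    }
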
namespace AArch64II {
@@ -1379,12 +515,7 @@ namespace AArch64II {
/// thread-local symbol. On Darwin, only one type of thread-local access
/// exists (pre linker-relaxation), but on ELF the TLSModel used for the
/// referee will affect interpretation.
- MO_TLS = 0x40,
-
- /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
- /// the address of a constant pool entry for the symbol, rather than the
- /// address of the symbol itself.
- MO_CONSTPOOL = 0x80
+ MO_TLS = 0x40
};
} // end namespace AArch64II
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
deleted file mode 100644
index 0b80f82f2b99b..0000000000000
--- a/lib/Target/AArch64/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Utils
-
-# Hack: we need to include 'main' AArch64 target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common